Total coverage: 176935 (15%)of 1222136
5 5 5 4 242 5 1186 498 497 50 49 6 7 2682 2687 2683 2683 2681 2684 2680 2679 2685 2680 329 330 1797 469 1300 6 34 3 1 1328 6 955 954 98 28 12 829 143 38 108 922 37 952 953 86 42 833 29 15 914 955 390 391 160 158 119 168 168 168 168 168 168 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 // SPDX-License-Identifier: GPL-2.0-or-later /* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "CRED: " fmt #include <linux/export.h> #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/coredump.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/init_task.h> #include <linux/security.h> #include <linux/binfmts.h> #include <linux/cn_proc.h> #include <linux/uidgid.h> #if 0 #define kdebug(FMT, ...) \ printk("[%-5.5s%5u] " FMT "\n", \ current->comm, current->pid, ##__VA_ARGS__) #else #define kdebug(FMT, ...) \ do { \ if (0) \ no_printk("[%-5.5s%5u] " FMT "\n", \ current->comm, current->pid, ##__VA_ARGS__); \ } while (0) #endif static struct kmem_cache *cred_jar; /* init to 2 - one for init_task, one to ensure it is never freed */ static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; /* * The initial credentials for the initial task */ struct cred init_cred = { .usage = ATOMIC_INIT(4), .uid = GLOBAL_ROOT_UID, .gid = GLOBAL_ROOT_GID, .suid = GLOBAL_ROOT_UID, .sgid = GLOBAL_ROOT_GID, .euid = GLOBAL_ROOT_UID, .egid = GLOBAL_ROOT_GID, .fsuid = GLOBAL_ROOT_UID, .fsgid = GLOBAL_ROOT_GID, .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, .cap_effective = CAP_FULL_SET, .cap_bset = CAP_FULL_SET, .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, .ucounts = &init_ucounts, }; /* * The RCU callback to actually dispose of a set of credentials */ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); kdebug("put_cred_rcu(%p)", cred); if (atomic_long_read(&cred->usage) != 0) panic("CRED: put_cred_rcu() sees %p with usage %ld\n", cred, atomic_long_read(&cred->usage)); security_cred_free(cred); key_put(cred->session_keyring); key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); if (cred->ucounts) put_ucounts(cred->ucounts); put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } /** * __put_cred - Destroy a set of credentials * @cred: The record to release * * Destroy a set of credentials on which no references remain. */ void __put_cred(struct cred *cred) { kdebug("__put_cred(%p{%ld})", cred, atomic_long_read(&cred->usage)); BUG_ON(atomic_long_read(&cred->usage) != 0); BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); if (cred->non_rcu) put_cred_rcu(&cred->rcu); else call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); /* * Clean up a task's credentials when it exits */ void exit_creds(struct task_struct *tsk) { struct cred *real_cred, *cred; kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred, atomic_long_read(&tsk->cred->usage)); real_cred = (struct cred *) tsk->real_cred; tsk->real_cred = NULL; cred = (struct cred *) tsk->cred; tsk->cred = NULL; if (real_cred == cred) { put_cred_many(cred, 2); } else { put_cred(real_cred); put_cred(cred); } #ifdef CONFIG_KEYS_REQUEST_CACHE key_put(tsk->cached_requested_key); tsk->cached_requested_key = NULL; #endif } /** * get_task_cred - Get another task's objective credentials * @task: The task to query * * Get the objective credentials of a task, pinning them so that they can't go * away. Accessing a task's credentials directly is not permitted. * * The caller must also make sure task doesn't get deleted, either by holding a * ref on task or by holding tasklist_lock to prevent it from being unlinked. */ const struct cred *get_task_cred(struct task_struct *task) { const struct cred *cred; rcu_read_lock(); do { cred = __task_cred((task)); BUG_ON(!cred); } while (!get_cred_rcu(cred)); rcu_read_unlock(); return cred; } EXPORT_SYMBOL(get_task_cred); /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. */ struct cred *cred_alloc_blank(void) { struct cred *new; new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); if (!new) return NULL; atomic_long_set(&new->usage, 1); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } /** * prepare_creds - Prepare a new set of credentials for modification * * Prepare a new set of task credentials for modification. A task's creds * shouldn't generally be modified directly, therefore this function is used to * prepare a new copy, which the caller then modifies and then commits by * calling commit_creds(). * * Preparation involves making a copy of the objective creds for modification. * * Returns a pointer to the new creds-to-be if successful, NULL otherwise. * * Call commit_creds() or abort_creds() to clean up. */ struct cred *prepare_creds(void) { struct task_struct *task = current; const struct cred *old; struct cred *new; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_creds() alloc %p", new); old = task->cred; memcpy(new, old, sizeof(struct cred)); new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_group_info(new->group_info); get_uid(new->user); get_user_ns(new->user_ns); #ifdef CONFIG_KEYS key_get(new->session_keyring); key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { struct cred *new; new = prepare_creds(); if (!new) return new; #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; /* inherit the session keyring; new process keyring */ key_put(new->process_keyring); new->process_keyring = NULL; #endif new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; return new; } /* * Copy credentials for the new process created by fork() * * We share if we can, but under some circumstances we have to generate a new * set. * * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { struct cred *new; int ret; #ifdef CONFIG_KEYS_REQUEST_CACHE p->cached_requested_key = NULL; #endif if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && #endif clone_flags & CLONE_THREAD ) { p->real_cred = get_cred_many(p->cred, 2); kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } new = prepare_creds(); if (!new) return -ENOMEM; if (clone_flags & CLONE_NEWUSER) { ret = create_user_ns(new); if (ret < 0) goto error_put; ret = set_cred_ucounts(new); if (ret < 0) goto error_put; } #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ if (new->thread_keyring) { key_put(new->thread_keyring); new->thread_keyring = NULL; if (clone_flags & CLONE_THREAD) install_thread_keyring_to_cred(new); } /* The process keyring is only shared between the threads in a process; * anything outside of those threads doesn't inherit. */ if (!(clone_flags & CLONE_THREAD)) { key_put(new->process_keyring); new->process_keyring = NULL; } #endif p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; error_put: put_cred(new); return ret; } static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) { const struct user_namespace *set_ns = set->user_ns; const struct user_namespace *subset_ns = subset->user_ns; /* If the two credentials are in the same user namespace see if * the capabilities of subset are a subset of set. */ if (set_ns == subset_ns) return cap_issubset(subset->cap_permitted, set->cap_permitted); /* The credentials are in a different user namespaces * therefore one is a subset of the other only if a set is an * ancestor of subset and set->euid is owner of subset or one * of subsets ancestors. */ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { if ((set_ns == subset_ns->parent) && uid_eq(subset_ns->owner, set->euid)) return true; } return false; } /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned * * Install a new set of credentials to the current task, using RCU to replace * the old set. Both the objective and the subjective credentials pointers are * updated. This function may not be called if the subjective credentials are * in an overridden state. * * This function eats the caller's reference to the new credentials. * * Always returns 0 thus allowing this function to be tail-called at the end * of, say, sys_setgid(). */ int commit_creds(struct cred *new) { struct task_struct *task = current; const struct cred *old = task->real_cred; kdebug("commit_creds(%p{%ld})", new, atomic_long_read(&new->usage)); BUG_ON(task->cred != old); BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ if (!uid_eq(old->euid, new->euid) || !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; /* * If a task drops privileges and becomes nondumpable, * the dumpability change must become visible before * the credential change; otherwise, a __ptrace_may_access() * racing with this change may be able to attach to a task it * shouldn't be able to attach to (as if the task had dropped * privileges without becoming nondumpable). * Pairs with a read barrier in __ptrace_may_access(). */ smp_wmb(); } /* alter the thread keyring */ if (!uid_eq(new->fsuid, old->fsuid)) key_fsuid_changed(new); if (!gid_eq(new->fsgid, old->fsgid)) key_fsgid_changed(new); /* do it * RLIMIT_NPROC limits on user->processes have already been checked * in set_user(). */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); /* send notifications */ if (!uid_eq(new->uid, old->uid) || !uid_eq(new->euid, old->euid) || !uid_eq(new->suid, old->suid) || !uid_eq(new->fsuid, old->fsuid)) proc_id_connector(task, PROC_EVENT_UID); if (!gid_eq(new->gid, old->gid) || !gid_eq(new->egid, old->egid) || !gid_eq(new->sgid, old->sgid) || !gid_eq(new->fsgid, old->fsgid)) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ put_cred_many(old, 2); return 0; } EXPORT_SYMBOL(commit_creds); /** * abort_creds - Discard a set of credentials and unlock the current task * @new: The credentials that were going to be applied * * Discard a set of credentials that were under construction and unlock the * current task. */ void abort_creds(struct cred *new) { kdebug("abort_creds(%p{%ld})", new, atomic_long_read(&new->usage)); BUG_ON(atomic_long_read(&new->usage) < 1); put_cred(new); } EXPORT_SYMBOL(abort_creds); /** * cred_fscmp - Compare two credentials with respect to filesystem access. * @a: The first credential * @b: The second credential * * cred_cmp() will return zero if both credentials have the same * fsuid, fsgid, and supplementary groups. That is, if they will both * provide the same access to files based on mode/uid/gid. * If the credentials are different, then either -1 or 1 will * be returned depending on whether @a comes before or after @b * respectively in an arbitrary, but stable, ordering of credentials. * * Return: -1, 0, or 1 depending on comparison */ int cred_fscmp(const struct cred *a, const struct cred *b) { struct group_info *ga, *gb; int g; if (a == b) return 0; if (uid_lt(a->fsuid, b->fsuid)) return -1; if (uid_gt(a->fsuid, b->fsuid)) return 1; if (gid_lt(a->fsgid, b->fsgid)) return -1; if (gid_gt(a->fsgid, b->fsgid)) return 1; ga = a->group_info; gb = b->group_info; if (ga == gb) return 0; if (ga == NULL) return -1; if (gb == NULL) return 1; if (ga->ngroups < gb->ngroups) return -1; if (ga->ngroups > gb->ngroups) return 1; for (g = 0; g < ga->ngroups; g++) { if (gid_lt(ga->gid[g], gb->gid[g])) return -1; if (gid_gt(ga->gid[g], gb->gid[g])) return 1; } return 0; } EXPORT_SYMBOL(cred_fscmp); int set_cred_ucounts(struct cred *new) { struct ucounts *new_ucounts, *old_ucounts = new->ucounts; /* * This optimization is needed because alloc_ucounts() uses locks * for table lookups. */ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid)) return 0; if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid))) return -EAGAIN; new->ucounts = new_ucounts; put_ucounts(old_ucounts); return 0; } /* * initialise the credentials stuff */ void __init cred_init(void) { /* allocate a slab in which we can store credentials */ cred_jar = KMEM_CACHE(cred, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } /** * prepare_kernel_cred - Prepare a set of credentials for a kernel service * @daemon: A userspace daemon to be used as a reference * * Prepare a set of credentials for a kernel service. This can then be used to * override a task's own credentials so that work can be done on behalf of that * task that requires a different subjective context. * * @daemon is used to provide a base cred, with the security data derived from * that; if this is "&init_task", they'll be set to 0, no groups, full * capabilities, and no keys. * * The caller may change these controls afterwards if desired. * * Returns the new credentials or NULL if out of memory. */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { const struct cred *old; struct cred *new; if (WARN_ON_ONCE(!daemon)) return NULL; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); old = get_task_cred(daemon); *new = *old; new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_uid(new->user); get_user_ns(new->user_ns); get_group_info(new->group_info); #ifdef CONFIG_KEYS new->session_keyring = NULL; new->process_keyring = NULL; new->thread_keyring = NULL; new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); return new; error: put_cred(new); put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); /** * set_security_override - Set the security ID in a set of credentials * @new: The credentials to alter * @secid: The LSM security ID to set * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. */ int set_security_override(struct cred *new, u32 secid) { return security_kernel_act_as(new, secid); } EXPORT_SYMBOL(set_security_override); /** * set_security_override_from_ctx - Set the security ID in a set of credentials * @new: The credentials to alter * @secctx: The LSM security context to generate the security ID from. * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. The * security ID is specified in string form as a security context to be * interpreted by the LSM. */ int set_security_override_from_ctx(struct cred *new, const char *secctx) { u32 secid; int ret; ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); if (ret < 0) return ret; return set_security_override(new, secid); } EXPORT_SYMBOL(set_security_override_from_ctx); /** * set_create_files_as - Set the LSM file create context in a set of credentials * @new: The credentials to alter * @inode: The inode to take the context from * * Change the LSM file creation context in a set of credentials to be the same * as the object context of the specified inode, so that the new inodes have * the same MAC context as that inode. */ int set_create_files_as(struct cred *new, struct inode *inode) { if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as);
160 105 3 2 80 98 80 81 5 2 3 5 2 2 155 155 155 131 131 1 2 131 5 1 1 1 5 5 3 3 3 16 16 15 26 14 1 1 37 178 178 36 169 7 105 112 31 105 26 26 131 36 7 7 26 38 1 109 145 36 112 145 156 46 154 2 156 46 46 156 2 2 2 156 270 270 270 270 264 4 6 262 5 4 1 262 5 5 262 262 266 253 15 283 2853 360 2843 21 1 7 7 6 12 3 5 5 1 380 1 253 253 156 156 187 187 119 384 381 384 384 381 382 382 381 381 380 380 379 381 380 382 382 381 2 375 260 7 258 28 5 29 28 4 29 30 1 29 3 2 1 9 9 9 9 5 9 1 155 155 254 121 136 14 35 36 24 24 24 4 14 24 1 214 207 55 7 48 48 33 419 2257 2259 233 4 1922 92 157 107 4 4 1 1 46 58 1 1922 158 158 8 155 4 55 4 93 93 92 66 2632 2252 1 369 3 2368 254 234 2383 2612 456 1945 1926 215 452 41 631 7 622 23 4 7 13 210 2308 2517 2520 2507 92 2490 5 41 2490 5 442 1 173 1 2 2258 2436 2426 5 42 42 480 505 46 2640 2635 2622 2620 1 1 1 23 25 4 2 30 12 42 20 20 20 18 31 3 3 3 38 56 49 49 3 1 49 48 3 57 1 56 53 1 48 61 61 57 18 39 56 4 160 5 155 155 155 254 811 863 1 863 863 1 1 1 617 253 2493 2 2494 5 2 276 207 1 71 2 25 16 2 1 2 1 2 11 1 121 137 236 21 30 209 238 238 3 3 7 2 2 3 3 2 4 3 3 4 11 10 1 3 2 1 3 6 26 6 9 1 3 2 6 31 24 2 1 1 3 432 314 8 321 151 2 40 2 1 391 41 248 1 1 1 2 6 147 2 1 1 2 3 2 3 3 2 1 7 1 3 1 4 2 1 31 2 7 2 21 2 4 2 1 3 2 1 11 2 3 2 1 7 2 1 3 1 1 13 7 147 431 30 2 28 26 26 145 145 2 2 1 169 61 2 1 1 85 14 6 2 2 81 81 81 4574 4005 780 81 187 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 // SPDX-License-Identifier: GPL-2.0-or-later /* * TUN - Universal TUN/TAP device driver. * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> * * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $ */ /* * Changes: * * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14 * Add TUNSETLINK ioctl to set the link encapsulation * * Mark Smith <markzzzsmith@yahoo.com.au> * Use eth_random_addr() for tap MAC address. * * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 * Fixes in packet dropping, queue length setting and queue wakeup. * Increased default tx queue length. * Added ethtool API. * Minor cleanups * * Daniel Podlejski <underley@underley.eu.org> * Modifications for 2.3.99-pre5 kernel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DRV_NAME "tun" #define DRV_VERSION "1.6" #define DRV_DESCRIPTION "Universal TUN/TAP device driver" #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/miscdevice.h> #include <linux/ethtool.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_tun.h> #include <linux/if_vlan.h> #include <linux/crc32.h> #include <linux/math.h> #include <linux/nsproxy.h> #include <linux/virtio_net.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <net/ip_tunnels.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/mutex.h> #include <linux/ieee802154.h> #include <uapi/linux/if_ltalk.h> #include <uapi/linux/if_fddi.h> #include <uapi/linux/if_hippi.h> #include <uapi/linux/if_fc.h> #include <net/ax25.h> #include <net/rose.h> #include <net/6lowpan.h> #include <net/rps.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> #include "tun_vnet.h" static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 struct tap_filter { unsigned int count; /* Number of addrs. Zero means disabled */ u32 mask[2]; /* Mask of the hashed addrs */ unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; /* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal * to max number of VCPUs in guest. */ #define MAX_TAP_QUEUES 256 #define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and * tap_filter were kept in tun_struct since they were used for filtering for the * netdevice not for a specific queue (at least I didn't see the requirement for * this). * * RCU usage: * The tun_file and tun_struct are loosely coupled, the pointer from one to the * other can only be read while rcu_read_lock or rtnl_lock is held. */ struct tun_file { struct sock sk; struct socket socket; struct tun_struct __rcu *tun; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; union { u16 queue_index; unsigned int ifindex; }; struct napi_struct napi; bool napi_enabled; bool napi_frags_enabled; struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct ptr_ring tx_ring; struct xdp_rxq_info xdp_rxq; }; struct tun_page { struct page *page; int count; }; struct tun_flow_entry { struct hlist_node hash_link; struct rcu_head rcu; struct tun_struct *tun; u32 rxhash; u32 rps_rxhash; int queue_index; unsigned long updated ____cacheline_aligned_in_smp; }; #define TUN_NUM_FLOW_ENTRIES 1024 #define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1) struct tun_prog { struct rcu_head rcu; struct bpf_prog *prog; }; /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. */ struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4 | \ NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM) int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; u32 msg_enable; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; u32 flow_count; u32 rx_batched; atomic_long_t rx_frame_errors; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; struct ifreq *ifr; }; struct veth { __be16 h_vlan_proto; __be16 h_vlan_TCI; }; static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; struct sk_buff *skb; int received = 0; __skb_queue_head_init(&process_queue); spin_lock(&queue->lock); skb_queue_splice_tail_init(queue, &process_queue); spin_unlock(&queue->lock); while (received < budget && (skb = __skb_dequeue(&process_queue))) { napi_gro_receive(napi, skb); ++received; } if (!skb_queue_empty(&process_queue)) { spin_lock(&queue->lock); skb_queue_splice(&process_queue, queue); spin_unlock(&queue->lock); } return received; } static int tun_napi_poll(struct napi_struct *napi, int budget) { unsigned int received; received = tun_napi_receive(napi, budget); if (received < budget) napi_complete_done(napi, received); return received; } static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, bool napi_en, bool napi_frags) { tfile->napi_enabled = napi_en; tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll); napi_enable(&tfile->napi); } } static void tun_napi_enable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_enable(&tfile->napi); } static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_disable(&tfile->napi); } static void tun_napi_del(struct tun_file *tfile) { if (tfile->napi_enabled) netif_napi_del(&tfile->napi); } static bool tun_napi_frags_enabled(const struct tun_file *tfile) { return tfile->napi_frags_enabled; } static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; } static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) { struct tun_flow_entry *e; hlist_for_each_entry_rcu(e, head, hash_link) { if (e->rxhash == rxhash) return e; } return NULL; } static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, struct hlist_head *head, u32 rxhash, u16 queue_index) { struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e) { netif_info(tun, tx_queued, tun->dev, "create flow: hash %u index %u\n", rxhash, queue_index); e->updated = jiffies; e->rxhash = rxhash; e->rps_rxhash = 0; e->queue_index = queue_index; e->tun = tun; hlist_add_head_rcu(&e->hash_link, head); ++tun->flow_count; } return e; } static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; } static void tun_flow_flush(struct tun_struct *tun) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) tun_flow_delete(tun, e); } spin_unlock_bh(&tun->lock); } static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { if (e->queue_index == queue_index) tun_flow_delete(tun, e); } } spin_unlock_bh(&tun->lock); } static void tun_flow_cleanup(struct timer_list *t) { struct tun_struct *tun = timer_container_of(tun, t, flow_gc_timer); unsigned long delay = tun->ageing_time; unsigned long next_timer = jiffies + delay; unsigned long count = 0; int i; spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { unsigned long this_timer; this_timer = e->updated + delay; if (time_before_eq(this_timer, jiffies)) { tun_flow_delete(tun, e); continue; } count++; if (time_before(this_timer, next_timer)) next_timer = this_timer; } } if (count) mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); spin_unlock(&tun->lock); } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, struct tun_file *tfile) { struct hlist_head *head; struct tun_flow_entry *e; unsigned long delay = tun->ageing_time; u16 queue_index = tfile->queue_index; head = &tun->flows[tun_hashfn(rxhash)]; rcu_read_lock(); e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? */ if (READ_ONCE(e->queue_index) != queue_index) WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies) e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); } else { spin_lock_bh(&tun->lock); if (!tun_flow_find(head, rxhash) && tun->flow_count < MAX_TAP_FLOWS) tun_flow_create(tun, head, rxhash, queue_index); if (!timer_pending(&tun->flow_gc_timer)) mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + delay)); spin_unlock_bh(&tun->lock); } rcu_read_unlock(); } /* Save the hash received in the stack receive path and update the * flow_hash table accordingly. */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; } /* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here. */ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_flow_entry *e; u32 txq, numqueues; numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq); txq = e->queue_index; } else { txq = reciprocal_scale(txq, numqueues); } return txq; } static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_prog *prog; u32 numqueues; u16 ret = 0; numqueues = READ_ONCE(tun->numqueues); if (!numqueues) return 0; prog = rcu_dereference(tun->steering_prog); if (prog) ret = bpf_prog_run_clear_cb(prog->prog, skb); return ret % numqueues; } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct tun_struct *tun = netdev_priv(dev); u16 ret; rcu_read_lock(); if (rcu_dereference(tun->steering_prog)) ret = tun_ebpf_select_queue(tun, skb); else ret = tun_automq_select_queue(tun, skb); rcu_read_unlock(); return ret; } static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || (gid_valid(tun->group) && !in_egroup_p(tun->group))) && !ns_capable(net->user_ns, CAP_NET_ADMIN); } static void tun_set_real_num_queues(struct tun_struct *tun) { netif_set_real_num_tx_queues(tun->dev, tun->numqueues); netif_set_real_num_rx_queues(tun->dev, tun->numqueues); } static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile) { tfile->detached = tun; list_add_tail(&tfile->next, &tun->disabled); ++tun->numdisabled; } static struct tun_struct *tun_enable_queue(struct tun_file *tfile) { struct tun_struct *tun = tfile->detached; tfile->detached = NULL; list_del_init(&tfile->next); --tun->numdisabled; return tun; } void tun_ptr_free(void *ptr) { if (!ptr) return; if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } } EXPORT_SYMBOL_GPL(tun_ptr_free); static void tun_queue_purge(struct tun_file *tfile) { void *ptr; while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL) tun_ptr_free(ptr); skb_queue_purge(&tfile->sk.sk_write_queue); skb_queue_purge(&tfile->sk.sk_error_queue); } static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; struct tun_struct *tun; tun = rtnl_dereference(tfile->tun); if (tun && clean) { if (!tfile->detached) tun_napi_disable(tfile); tun_napi_del(tfile); } if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); rcu_assign_pointer(tun->tfiles[index], tun->tfiles[tun->numqueues - 1]); ntfile = rtnl_dereference(tun->tfiles[index]); ntfile->queue_index = index; ntfile->xdp_rxq.queue_index = index; rcu_assign_pointer(tun->tfiles[tun->numqueues - 1], NULL); --tun->numqueues; if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); } else { tun_disable_queue(tun, tfile); tun_napi_disable(tfile); } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); /* Drop read queue */ tun_queue_purge(tfile); tun_set_real_num_queues(tun); } else if (tfile->detached && clean) { tun = tun_enable_queue(tfile); sock_put(&tfile->sk); } if (clean) { if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { netif_carrier_off(tun->dev); if (!(tun->flags & IFF_PERSIST) && tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } if (tun) xdp_rxq_info_unreg(&tfile->xdp_rxq); ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); } } static void tun_detach(struct tun_file *tfile, bool clean) { struct tun_struct *tun; struct net_device *dev; rtnl_lock(); tun = rtnl_dereference(tfile->tun); dev = tun ? tun->dev : NULL; __tun_detach(tfile, clean); if (dev) netdev_state_change(dev); rtnl_unlock(); if (clean) sock_put(&tfile->sk); } static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); tun_napi_disable(tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } BUG_ON(tun->numqueues != 0); synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tun_napi_del(tfile); /* Drop read queue */ tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter, bool napi, bool napi_frags, bool publish_tun) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); if (err < 0) goto out; err = -EINVAL; if (rtnl_dereference(tfile->tun) && !tfile->detached) goto out; err = -EBUSY; if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1) goto out; err = -E2BIG; if (!tfile->detached && tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES) goto out; err = 0; /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { lock_sock(tfile->socket.sk); err = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (!err) goto out; } if (!tfile->detached && ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free)) { err = -ENOMEM; goto out; } tfile->queue_index = tun->numqueues; tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; if (tfile->detached) { /* Re-attach detached tfile, updating XDP queue_index */ WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq)); if (tfile->xdp_rxq.queue_index != tfile->queue_index) tfile->xdp_rxq.queue_index = tfile->queue_index; } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { xdp_rxq_info_unreg(&tfile->xdp_rxq); goto out; } err = 0; } if (tfile->detached) { tun_enable_queue(tfile); tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); } if (rtnl_dereference(tun->xdp_prog)) sock_set_flag(&tfile->sk, SOCK_XDP); /* device is allowed to go away first, so no need to hold extra * refcnt. */ /* Publish tfile->tun and tun->tfiles only after we've fully * initialized tfile; otherwise we risk using half-initialized * object. */ if (publish_tun) rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; tun_set_real_num_queues(tun); out: return err; } static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; rcu_read_lock(); tun = rcu_dereference(tfile->tun); if (tun) dev_hold(tun->dev); rcu_read_unlock(); return tun; } static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); } /* TAP filtering */ static void addr_hash_set(u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; mask[n >> 5] |= (1 << (n & 31)); } static unsigned int addr_hash_test(const u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; return mask[n >> 5] & (1 << (n & 31)); } static int update_filter(struct tap_filter *filter, void __user *arg) { struct { u8 u[ETH_ALEN]; } *addr; struct tun_filter uf; int err, alen, n, nexact; if (copy_from_user(&uf, arg, sizeof(uf))) return -EFAULT; if (!uf.count) { /* Disabled */ filter->count = 0; return 0; } alen = ETH_ALEN * uf.count; addr = memdup_user(arg + sizeof(uf), alen); if (IS_ERR(addr)) return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst * case we'll accept a few undesired packets. */ filter->count = 0; wmb(); /* Use first set of addresses as an exact filter */ for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++) memcpy(filter->addr[n], addr[n].u, ETH_ALEN); nexact = n; /* Remaining multicast addresses are hashed, * unicast will leave the filter disabled. */ memset(filter->mask, 0, sizeof(filter->mask)); for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } /* For ALLMULTI just set the mask to all ones. * This overrides the mask populated above. */ if ((uf.flags & TUN_FLT_ALLMULTI)) memset(filter->mask, ~0, sizeof(filter->mask)); /* Now enable the filter */ wmb(); filter->count = nexact; /* Return the number of exact filters */ err = nexact; free_addr: kfree(addr); return err; } /* Returns: 0 - drop, !=0 - accept */ static int run_filter(struct tap_filter *filter, const struct sk_buff *skb) { /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect * at this point. */ struct ethhdr *eh = (struct ethhdr *) skb->data; int i; /* Exact match */ for (i = 0; i < filter->count; i++) if (ether_addr_equal(eh->h_dest, filter->addr[i])) return 1; /* Inexact match (multicast only) */ if (is_multicast_ether_addr(eh->h_dest)) return addr_hash_test(filter->mask, eh->h_dest); return 0; } /* * Checks whether the packet is accepted or not. * Returns: 0 - drop, !=0 - accept */ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) { if (!filter->count) return 1; return run_filter(filter, skb); } /* Network device part of the driver */ static const struct ethtool_ops tun_ethtool_ops; static int tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct ifreq *ifr = tun->ifr; int err; spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) return err; tun_flow_init(tun); dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->hw_enc_features = dev->hw_features; dev->features = dev->hw_features; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); dev->lltx = true; tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, false); if (err < 0) { tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); return err; } return 0; } /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { tun_detach_all(dev); } /* Net device open. */ static int tun_net_open(struct net_device *dev) { netif_tx_start_all_queues(dev); return 0; } /* Net device close. */ static int tun_net_close(struct net_device *dev) { netif_tx_stop_all_queues(dev); return 0; } /* Net device start xmit */ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { #ifdef CONFIG_RPS if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ struct tun_flow_entry *e; __u32 rxhash; rxhash = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash); if (e) tun_flow_save_rps_rxhash(e, rxhash); } #endif } static unsigned int run_ebpf_filter(struct tun_struct *tun, struct sk_buff *skb, int len) { struct tun_prog *prog = rcu_dereference(tun->filter_prog); if (prog) len = bpf_prog_run_clear_cb(prog->prog, skb); return len; } /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); /* Drop packet if interface is not attached */ if (!tfile) { drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len); /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. */ if (!check_filter(&tun->txflt, skb)) { drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop; } if (tfile->socket.sk->sk_filter && sk_filter_reason(tfile->socket.sk, skb, &drop_reason)) goto drop; len = run_ebpf_filter(tun, skb, len); if (len == 0) { drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop; } if (pskb_trim(skb, len)) { drop_reason = SKB_DROP_REASON_NOMEM; goto drop; } if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. */ skb_orphan(skb); nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } /* dev->lltx requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); txq_trans_cond_update(queue); /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); rcu_read_unlock(); return NETDEV_TX_OK; drop: dev_core_stats_tx_dropped_inc(dev); skb_tx_error(skb); kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); return NET_XMIT_DROP; } static void tun_net_mclist(struct net_device *dev) { /* * This callback is supposed to deal with mc filter in * _rx_ path and has nothing to do with the _tx_ path. * In rx path we always accept everything userspace gives us. */ } static netdev_features_t tun_net_fix_features(struct net_device *dev, netdev_features_t features) { struct tun_struct *tun = netdev_priv(dev); return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } static void tun_set_headroom(struct net_device *dev, int new_hr) { struct tun_struct *tun = netdev_priv(dev); if (new_hr < NET_SKB_PAD) new_hr = NET_SKB_PAD; tun->align = new_hr; } static void tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct tun_struct *tun = netdev_priv(dev); dev_get_tstats64(dev, stats); stats->rx_frame_errors += (unsigned long)atomic_long_read(&tun->rx_frame_errors); } static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; struct bpf_prog *old_prog; int i; old_prog = rtnl_dereference(tun->xdp_prog); rcu_assign_pointer(tun->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } list_for_each_entry(tfile, &tun->disabled, next) { if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } return 0; } static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) { if (new_carrier) { struct tun_struct *tun = netdev_priv(dev); if (!tun->numqueues) return -EPERM; netif_carrier_on(dev); } else { netif_carrier_off(dev); } return 0; } static const struct net_device_ops tun_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_select_queue = tun_select_queue, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, .ndo_change_carrier = tun_net_change_carrier, }; static void __tun_xdp_flush_tfile(struct tun_file *tfile) { /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); } static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; int nxmit = 0; int i; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); resample: numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { rcu_read_unlock(); return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); if (unlikely(!tfile)) goto resample; spin_lock(&tfile->tx_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *xdp = frames[i]; /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff. */ void *frame = tun_xdp_to_ptr(xdp); if (__ptr_ring_produce(&tfile->tx_ring, frame)) { dev_core_stats_tx_dropped_inc(dev); break; } nxmit++; } spin_unlock(&tfile->tx_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __tun_xdp_flush_tfile(tfile); rcu_read_unlock(); return nxmit; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); int nxmit; if (unlikely(!frame)) return -EOVERFLOW; nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH); if (!nxmit) xdp_return_frame_rx_napi(frame); return nxmit; } static const struct net_device_ops tap_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_set_rx_mode = tun_net_mclist, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_select_queue = tun_select_queue, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_bpf = tun_xdp, .ndo_xdp_xmit = tun_xdp_xmit, .ndo_change_carrier = tun_net_change_carrier, }; static void tun_flow_init(struct tun_struct *tun) { int i; for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) INIT_HLIST_HEAD(&tun->flows[i]); tun->ageing_time = TUN_FLOW_EXPIRE; timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0); mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + tun->ageing_time)); } static void tun_flow_uninit(struct tun_struct *tun) { timer_delete_sync(&tun->flow_gc_timer); tun_flow_flush(tun); } #define MIN_MTU 68 #define MAX_MTU 65535 /* Initialize net device. */ static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = 1500; /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; break; case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; eth_hw_addr_random(dev); /* Currently tun does not support XDP, only tap does. */ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT; break; } dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU - dev->hard_header_len; } static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile) { struct sock *sk = tfile->socket.sk; return (tun->dev->flags & IFF_UP) && sock_writeable(sk); } /* Character device part */ /* Poll */ static __poll_t tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); struct sock *sk; __poll_t mask = 0; if (!tun) return EPOLLERR; sk = tfile->socket.sk; poll_wait(file, sk_sleep(sk), wait); if (!ptr_ring_empty(&tfile->tx_ring)) mask |= EPOLLIN | EPOLLRDNORM; /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to * guarantee EPOLLOUT to be raised by either here or * tun_sock_write_space(). Then process could get notification * after it writes to a down device and meets -EIO. */ if (tun_sock_writeable(tun, tfile) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && tun_sock_writeable(tun, tfile))) mask |= EPOLLOUT | EPOLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) mask = EPOLLERR; tun_put(tun); return mask; } static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, size_t len, const struct iov_iter *it) { struct sk_buff *skb; size_t linear; int err; int i; if (it->nr_segs > MAX_SKB_FRAGS + 1 || len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); skb = napi_get_frags(&tfile->napi); local_bh_enable(); if (!skb) return ERR_PTR(-ENOMEM); linear = iov_iter_single_seg_count(it); err = __skb_grow(skb, linear); if (err) goto free; skb->len = len; skb->data_len = len - linear; skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { const struct iovec *iov = iter_iov(it) + i; size_t fragsz = iov->iov_len; struct page *page; void *frag; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } frag = netdev_alloc_frag(fragsz); if (!frag) { err = -ENOMEM; goto free; } page = virt_to_head_page(frag); skb_fill_page_desc(skb, i - 1, page, frag - page_address(page), fragsz); } return skb; free: /* frees skb and all frags allocated with napi_alloc_frag() */ napi_free_frags(&tfile->napi); return ERR_PTR(err); } /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, size_t prepad, size_t len, size_t linear, int noblock) { struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, &err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return ERR_PTR(err); skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, int more) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; u32 rx_batched = tun->rx_batched; bool rcv = false; if (!rx_batched || (!more && skb_queue_empty(queue))) { local_bh_disable(); skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); return; } spin_lock(&queue->lock); if (!more || skb_queue_len(queue) == rx_batched) { __skb_queue_head_init(&process_queue); skb_queue_splice_tail_init(queue, &process_queue); rcv = true; } else { __skb_queue_tail(queue, skb); } spin_unlock(&queue->lock); if (rcv) { struct sk_buff *nskb; local_bh_disable(); while ((nskb = __skb_dequeue(&process_queue))) { skb_record_rx_queue(nskb, tfile->queue_index); netif_receive_skb(nskb); } skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); } } static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, int len, int noblock, bool zerocopy) { if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) return false; if (tfile->socket.sk->sk_sndbuf != INT_MAX) return false; if (!noblock) return false; if (zerocopy) return false; if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; return true; } static struct sk_buff *__tun_build_skb(struct tun_file *tfile, struct page_frag *alloc_frag, char *buf, int buflen, int len, int pad, int metasize) { struct sk_buff *skb = build_skb(buf, buflen); if (!skb) return ERR_PTR(-ENOMEM); skb_reserve(skb, pad); skb_put(skb, len); if (metasize) skb_metadata_set(skb, metasize); skb_set_owner_w(skb, tfile->socket.sk); get_page(alloc_frag->page); alloc_frag->offset += buflen; return skb; } static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, struct xdp_buff *xdp, u32 act) { int err; switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); if (err) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); if (err < 0) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_PASS: break; default: bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: dev_core_stats_rx_dropped_inc(tun->dev); break; } return act; } static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, struct virtio_net_hdr *hdr, int len, int *skb_xdp) { struct page_frag *alloc_frag = &current->task_frag; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); char *buf; size_t copied; int pad = TUN_RX_PAD; int metasize = 0; int err = 0; rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) pad += XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) return ERR_PTR(-ENOMEM); buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return ERR_PTR(-EFAULT); /* There's a small window that XDP may be set after the check * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough. */ if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); } *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { struct xdp_buff xdp; u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp_prepare_buff(&xdp, buf, pad, len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) { if (act == XDP_REDIRECT || act == XDP_TX) put_page(alloc_frag->page); goto out; } if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) goto out; pad = xdp.data - xdp.data_hard_start; len = xdp.data_end - xdp.data; /* It is known that the xdp_buff was prepared with metadata * support, so the metasize will never be negative. */ metasize = xdp.data - xdp.data_meta; } bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); out: bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return NULL; } /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso; int good_linear; int copylen; int hdr_len = 0; bool zerocopy = false; int err; u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; netdev_features_t features = 0; /* * Keep it easy and always zero the whole buffer, even if the * tunnel-related field will be touched only when the feature * is enabled and the hdr size id compatible. */ memset(&hdr, 0, sizeof(hdr)); gso = (struct virtio_net_hdr *)&hdr; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); features = tun_vnet_hdr_guest_features(vnet_hdr_sz); hdr_len = __tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, features, from, gso); if (hdr_len < 0) return hdr_len; len -= vnet_hdr_sz; } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { struct iov_iter i = *from; /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ copylen = min(hdr_len ? hdr_len : GOODCOPY_LEN, good_linear); linear = copylen; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. */ skb = tun_build_skb(tun, tfile, from, gso, len, &skb_xdp); err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (!skb) return total_len; } else { if (!zerocopy) { copylen = len; linear = min(hdr_len, good_linear); } if (frags) { mutex_lock(&tfile->napi_mutex); skb = tun_napi_alloc_frags(tfile, copylen, from); /* tun_napi_alloc_frags() enforces a layout for the skb. * If zerocopy is enabled, then this layout will be * overwritten by zerocopy_sg_from_iter(). */ zerocopy = false; } else { if (!linear) linear = min_t(size_t, good_linear, copylen); skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); } err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { err = -EFAULT; drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } } if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, &hdr)) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; } switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; switch (ip_version) { case 4: pi.proto = htons(ETH_P_IP); break; case 6: pi.proto = htons(ETH_P_IPV6); break; default: err = -EINVAL; goto drop; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; skb->dev = tun->dev; break; case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); break; } /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { struct bpf_prog *xdp_prog; int ret; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); goto unlock_frags; } } rcu_read_unlock(); local_bh_enable(); } /* Compute the costly rx hash only if needed for flow updates. * We may get a very small possibility of OOO during switching, not * worth to optimize. */ if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); rcu_read_lock(); if (unlikely(!(tun->dev->flags & IFF_UP))) { err = -EIO; rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (frags) { u32 headlen; /* Exercise flow dissector code path. */ skb_push(skb, ETH_HLEN); headlen = eth_get_headlen(tun->dev, skb->data, skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { WARN_ON_ONCE(1); err = -ENOMEM; dev_core_stats_rx_dropped_inc(tun->dev); napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); return err; } if (likely(napi_schedule_prep(&tfile->napi))) { local_bh_disable(); napi_gro_frags(&tfile->napi); napi_complete(&tfile->napi); local_bh_enable(); } else { err = -EBUSY; goto napi_busy; } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; spin_lock_bh(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock_bh(&queue->lock); rcu_read_unlock(); err = -EBUSY; goto free_skb; } __skb_queue_tail(queue, skb); queue_len = skb_queue_len(queue); spin_unlock(&queue->lock); if (!more || queue_len > NAPI_POLL_WEIGHT) napi_schedule(&tfile->napi); local_bh_enable(); } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { tun_rx_batched(tun, tfile, skb, more); } else { netif_rx(skb); } rcu_read_unlock(); preempt_disable(); dev_sw_netstats_rx_add(tun->dev, len); preempt_enable(); if (rxhash) tun_flow_update(tun, rxhash, tfile); return total_len; drop: if (err != -EAGAIN) dev_core_stats_rx_dropped_inc(tun->dev); free_skb: if (!IS_ERR_OR_NULL(skb)) kfree_skb_reason(skb, drop_reason); unlock_frags: if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); } return err ?: total_len; } static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t result; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; result = tun_get_user(tun, tfile, NULL, from, noblock, false); tun_put(tun); return result; } static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; size_t size = xdp_frame->len; ssize_t ret; if (tun->flags & IFF_VNET_HDR) { struct virtio_net_hdr gso = { 0 }; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); if (ret) return ret; } ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, ret); preempt_enable(); return ret; } /* Put packet to the user space buffer */ static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; ssize_t total; int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; int ret; if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; if (tun->flags & IFF_VNET_HDR) vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); total = skb->len + vlan_hlen + vnet_hdr_sz; if (!(tun->flags & IFF_NO_PI)) { if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; total += sizeof(pi); if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; } if (vnet_hdr_sz) { struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso; ret = tun_vnet_hdr_tnl_from_skb(tun->flags, tun->dev, skb, &hdr); if (ret) return ret; /* * Drop the packet if the configured header size is too small * WRT the enabled offloads. */ gso = (struct virtio_net_hdr *)&hdr; ret = __tun_vnet_hdr_put(vnet_hdr_sz, tun->dev->features, iter, gso); if (ret) return ret; } if (vlan_hlen) { int ret; struct veth veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: /* caller is in process context, */ preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen); preempt_enable(); return total; } static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) { DECLARE_WAITQUEUE(wait, current); void *ptr = NULL; int error = 0; ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) goto out; if (noblock) { error = -EAGAIN; goto out; } add_wait_queue(&tfile->socket.wq.wait, &wait); while (1) { set_current_state(TASK_INTERRUPTIBLE); ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) break; if (signal_pending(current)) { error = -ERESTARTSYS; break; } if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { error = -EFAULT; break; } schedule(); } __set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->socket.wq.wait, &wait); out: *err = error; return ptr; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, int noblock, void *ptr) { ssize_t ret; int err; if (!iov_iter_count(to)) { tun_ptr_free(ptr); return 0; } if (!ptr) { /* Read frames from ring */ ptr = tun_ring_recv(tfile, noblock, &err); if (!ptr) return err; } if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); ret = tun_put_user_xdp(tun, tfile, xdpf, to); xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; ret = tun_put_user(tun, tfile, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t len = iov_iter_count(to), ret; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tun_do_read(tun, tfile, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; tun_put(tun); return ret; } static void tun_prog_free(struct rcu_head *rcu) { struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu); bpf_prog_destroy(prog->prog); kfree(prog); } static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, struct bpf_prog *prog) { struct tun_prog *old, *new = NULL; if (prog) { new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; new->prog = prog; } spin_lock_bh(&tun->lock); old = rcu_dereference_protected(*prog_p, lockdep_is_held(&tun->lock)); rcu_assign_pointer(*prog_p, new); spin_unlock_bh(&tun->lock); if (old) call_rcu(&old->rcu, tun_prog_free); return 0; } static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); __tun_set_ebpf(tun, &tun->filter_prog, NULL); } static void tun_setup(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); tun->owner = INVALID_UID; tun->group = INVALID_GID; tun_default_link_ksettings(dev, &tun->link_ksettings); dev->ethtool_ops = &tun_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = tun_free_netdev; /* We prefer our own queue length */ dev->tx_queue_len = TUN_READQ_SIZE; } /* Trivial set of netlink ops to allow deleting tun or tap * device with netlink. */ static int tun_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "tun/tap creation via rtnetlink is not supported."); return -EOPNOTSUPP; } static size_t tun_get_size(const struct net_device *dev) { BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t)); BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t)); return nla_total_size(sizeof(uid_t)) + /* OWNER */ nla_total_size(sizeof(gid_t)) + /* GROUP */ nla_total_size(sizeof(u8)) + /* TYPE */ nla_total_size(sizeof(u8)) + /* PI */ nla_total_size(sizeof(u8)) + /* VNET_HDR */ nla_total_size(sizeof(u8)) + /* PERSIST */ nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */ nla_total_size(sizeof(u32)) + /* NUM_QUEUES */ nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */ 0; } static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK)) goto nla_put_failure; if (uid_valid(tun->owner) && nla_put_u32(skb, IFLA_TUN_OWNER, from_kuid_munged(current_user_ns(), tun->owner))) goto nla_put_failure; if (gid_valid(tun->group) && nla_put_u32(skb, IFLA_TUN_GROUP, from_kgid_munged(current_user_ns(), tun->group))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE, !!(tun->flags & IFF_MULTI_QUEUE))) goto nla_put_failure; if (tun->flags & IFF_MULTI_QUEUE) { if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES, tun->numdisabled)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops tun_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct tun_struct), .setup = tun_setup, .validate = tun_validate, .get_size = tun_get_size, .fill_info = tun_fill_info, }; static void tun_sock_write_space(struct sock *sk) { struct tun_file *tfile; wait_queue_head_t *wqueue; if (!sock_writeable(sk)) return; if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_sync_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); tfile = container_of(sk, struct tun_file, sk); kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } static void tun_put_page(struct tun_page *tpage) { if (tpage->page) __page_frag_cache_drain(tpage->page, tpage->count); } static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush, struct tun_page *tpage) { unsigned int datasize = xdp->data_end - xdp->data; struct virtio_net_hdr *gso = xdp->data_hard_start; struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; struct sk_buff_head *queue; netdev_features_t features; u32 rxhash = 0, act; int buflen = xdp->frame_sz; int metasize = 0; int ret = 0; bool skb_xdp = false; struct page *page; if (unlikely(datasize < ETH_HLEN)) return -EINVAL; xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { if (gso->gso_type) { skb_xdp = true; goto build; } xdp_init_buff(xdp, buflen, &tfile->xdp_rxq); act = bpf_prog_run_xdp(xdp_prog, xdp); ret = tun_xdp_act(tun, xdp_prog, xdp, act); if (ret < 0) { put_page(virt_to_head_page(xdp->data)); return ret; } switch (ret) { case XDP_REDIRECT: *flush = true; fallthrough; case XDP_TX: return 0; case XDP_PASS: break; default: page = virt_to_head_page(xdp->data); if (tpage->page == page) { ++tpage->count; } else { tun_put_page(tpage); tpage->page = page; tpage->count = 1; } return 0; } } build: skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { ret = -ENOMEM; goto out; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); /* The externally provided xdp_buff may have no metadata support, which * is marked by xdp->data_meta being xdp->data + 1. This will lead to a * metasize of -1 and is the reason why the condition checks for > 0. */ metasize = xdp->data - xdp->data_meta; if (metasize > 0) skb_metadata_set(skb, metasize); features = tun_vnet_hdr_guest_features(READ_ONCE(tun->vnet_hdr_sz)); tnl_hdr = (struct virtio_net_hdr_v1_hash_tunnel *)gso; if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, tnl_hdr)) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; goto out; } skb->protocol = eth_type_trans(skb, tun->dev); skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { ret = 0; goto out; } } if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); if (tfile->napi_enabled) { queue = &tfile->sk.sk_write_queue; spin_lock(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock(&queue->lock); kfree_skb(skb); return -EBUSY; } __skb_queue_tail(queue, skb); spin_unlock(&queue->lock); ret = 1; } else { netif_receive_skb(skb); ret = 0; } /* No need to disable preemption here since this function is * always called with bh disabled */ dev_sw_netstats_rx_add(tun->dev, datasize); if (rxhash) tun_flow_update(tun, rxhash, tfile); out: return ret; } static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret, i; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; if (!tun) return -EBADFD; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct tun_page tpage; int n = ctl->num; int flush = 0, queued = 0; memset(&tpage, 0, sizeof(tpage)); local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); for (i = 0; i < n; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage); if (ret > 0) queued += ret; } if (flush) xdp_do_flush(); if (tfile->napi_enabled && queued > 0) napi_schedule(&tfile->napi); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); tun_put_page(&tpage); ret = total_len; goto out; } ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); out: tun_put(tun); return ret; } static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); void *ptr = m->msg_control; int ret; if (!tun) { ret = -EBADFD; goto out_free; } if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out_put_tun; } if (flags & MSG_ERRQUEUE) { ret = sock_recv_errqueue(sock->sk, m, total_len, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr); if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } out: tun_put(tun); return ret; out_put_tun: tun_put(tun); out_free: tun_ptr_free(ptr); return ret; } static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { return 0; } } static int tun_peek_len(struct socket *sock) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun; int ret = 0; tun = tun_get(tfile); if (!tun) return 0; ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); tun_put(tun); return ret; } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, }; static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) { return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return sysfs_emit(buf, "0x%x\n", tun_flags(tun)); } static ssize_t owner_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return uid_valid(tun->owner)? sysfs_emit(buf, "%u\n", from_kuid_munged(current_user_ns(), tun->owner)) : sysfs_emit(buf, "-1\n"); } static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return gid_valid(tun->group) ? sysfs_emit(buf, "%u\n", from_kgid_munged(current_user_ns(), tun->group)) : sysfs_emit(buf, "-1\n"); } static DEVICE_ATTR_RO(tun_flags); static DEVICE_ATTR_RO(owner); static DEVICE_ATTR_RO(group); static struct attribute *tun_dev_attrs[] = { &dev_attr_tun_flags.attr, &dev_attr_owner.attr, &dev_attr_group.attr, NULL }; static const struct attribute_group tun_attr_group = { .attrs = tun_dev_attrs }; static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { struct tun_struct *tun; struct tun_file *tfile = file->private_data; struct net_device *dev; int err; if (tfile->detached) return -EINVAL; if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!(ifr->ifr_flags & IFF_NAPI) || (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) return -EINVAL; } dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) return -EBUSY; if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) tun = netdev_priv(dev); else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) tun = netdev_priv(dev); else return -EINVAL; if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; if (tun_not_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) return err; err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, true); if (err < 0) return err; if (tun->flags & IFF_MULTI_QUEUE && (tun->numqueues + tun->numdisabled > 1)) { /* One or more queue has already been attached, no need * to initialize the device again. */ netdev_state_change(dev); return 0; } tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); netdev_state_change(dev); } else { char *name; unsigned long flags = 0; int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = security_tun_dev_create(); if (err < 0) return err; /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ flags |= IFF_TUN; name = "tun%d"; } else if (ifr->ifr_flags & IFF_TAP) { /* TAP device */ flags |= IFF_TAP; name = "tap%d"; } else return -EINVAL; if (*ifr->ifr_name) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, NET_NAME_UNKNOWN, tun_setup, queues, queues); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; dev->ifindex = tfile->ifindex; dev->sysfs_groups[0] = &tun_attr_group; tun = netdev_priv(dev); tun->dev = dev; tun->flags = flags; tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL); tun->ifr = ifr; tun->file = file; tun_net_initialize(dev); err = register_netdevice(tun->dev); if (err < 0) { free_netdev(dev); return err; } /* free_netdev() won't check refcnt, to avoid race * with dev_put() we need publish tun after registration. */ rcu_assign_pointer(tfile->tun, tun); } if (ifr->ifr_flags & IFF_NO_CARRIER) netif_carrier_off(tun->dev); else netif_carrier_on(tun->dev); /* Make sure persistent devices do not get stuck in * xoff state. */ if (netif_running(tun->dev)) netif_tx_wake_all_queues(tun->dev); strcpy(ifr->ifr_name, tun->dev->name); return 0; } static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) { strcpy(ifr->ifr_name, tun->dev->name); ifr->ifr_flags = tun_flags(tun); } #define PLAIN_GSO (NETIF_F_GSO_UDP_L4 | NETIF_F_TSO | NETIF_F_TSO6) /* This is like a cut-down ethtool ops, except done via tun fd so no * privs required. */ static int set_offload(struct tun_struct *tun, unsigned long arg) { netdev_features_t features = 0; if (arg & TUN_F_CSUM) { features |= NETIF_F_HW_CSUM; arg &= ~TUN_F_CSUM; if (arg & (TUN_F_TSO4|TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) { features |= NETIF_F_TSO_ECN; arg &= ~TUN_F_TSO_ECN; } if (arg & TUN_F_TSO4) features |= NETIF_F_TSO; if (arg & TUN_F_TSO6) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } arg &= ~TUN_F_UFO; /* TODO: for now USO4 and USO6 should work simultaneously */ if (arg & TUN_F_USO4 && arg & TUN_F_USO6) { features |= NETIF_F_GSO_UDP_L4; arg &= ~(TUN_F_USO4 | TUN_F_USO6); } /* * Tunnel offload is allowed only if some plain offload is * available, too. */ if (features & PLAIN_GSO && arg & TUN_F_UDP_TUNNEL_GSO) { features |= NETIF_F_GSO_UDP_TUNNEL; if (arg & TUN_F_UDP_TUNNEL_GSO_CSUM) features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; arg &= ~(TUN_F_UDP_TUNNEL_GSO | TUN_F_UDP_TUNNEL_GSO_CSUM); } } /* This gives the user a way to test for new features in future by * trying to set them. */ if (arg) return -EINVAL; tun->set_features = features; tun->dev->wanted_features &= ~TUN_USER_FEATURES; tun->dev->wanted_features |= features; netdev_update_features(tun->dev); return 0; } static void tun_detach_filter(struct tun_struct *tun, int n) { int i; struct tun_file *tfile; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); sk_detach_filter(tfile->socket.sk); release_sock(tfile->socket.sk); } tun->filter_attached = false; } static int tun_attach_filter(struct tun_struct *tun) { int i, ret = 0; struct tun_file *tfile; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); return ret; } } tun->filter_attached = true; return ret; } static void tun_set_sndbuf(struct tun_struct *tun) { struct tun_file *tfile; int i; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_sndbuf = tun->sndbuf; } } static int tun_set_queue(struct file *file, struct ifreq *ifr) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; int ret = 0; rtnl_lock(); if (ifr->ifr_flags & IFF_ATTACH_QUEUE) { tun = tfile->detached; if (!tun) { ret = -EINVAL; goto unlock; } ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI, tun->flags & IFF_NAPI_FRAGS, true); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); } else ret = -EINVAL; if (ret >= 0) netdev_state_change(tun->dev); unlock: rtnl_unlock(); return ret; } static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, void __user *data) { struct bpf_prog *prog; int fd; if (copy_from_user(&fd, data, sizeof(fd))) return -EFAULT; if (fd == -1) { prog = NULL; } else { prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) return PTR_ERR(prog); } return __tun_set_ebpf(tun, prog_p, prog); } /* Return correct value for tun->dev->addr_len based on tun->dev->type. */ static unsigned char tun_get_addr_len(unsigned short type) { switch (type) { case ARPHRD_IP6GRE: case ARPHRD_TUNNEL6: return sizeof(struct in6_addr); case ARPHRD_IPGRE: case ARPHRD_TUNNEL: case ARPHRD_SIT: return 4; case ARPHRD_ETHER: return ETH_ALEN; case ARPHRD_IEEE802154: case ARPHRD_IEEE802154_MONITOR: return IEEE802154_EXTENDED_ADDR_LEN; case ARPHRD_PHONET_PIPE: case ARPHRD_PPP: case ARPHRD_NONE: return 0; case ARPHRD_6LOWPAN: return EUI64_ADDR_LEN; case ARPHRD_FDDI: return FDDI_K_ALEN; case ARPHRD_HIPPI: return HIPPI_ALEN; case ARPHRD_IEEE802: return FC_ALEN; case ARPHRD_ROSE: return ROSE_ADDR_LEN; case ARPHRD_NETROM: return AX25_ADDR_LEN; case ARPHRD_LOCALTLK: return LTALK_ALEN; default: return 0; } } static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { struct tun_file *tfile = file->private_data; struct net *net = sock_net(&tfile->sk); struct tun_struct *tun; void __user* argp = (void __user*)arg; unsigned int carrier; struct ifreq ifr; kuid_t owner; kgid_t group; int ifindex; int sndbuf; int ret; bool do_notify = false; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; } else { memset(&ifr, 0, sizeof(ifr)); } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on * TUNSETIFF. */ return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); } else if (cmd == SIOCGSKNS) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; return open_related_ns(&net->ns, get_net_ns); } rtnl_lock(); tun = tun_get(tfile); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) goto unlock; ifr.ifr_name[IFNAMSIZ-1] = '\0'; ret = tun_set_iff(net, file, &ifr); if (ret) goto unlock; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; goto unlock; } if (cmd == TUNSETIFINDEX) { ret = -EPERM; if (tun) goto unlock; ret = -EFAULT; if (copy_from_user(&ifindex, argp, sizeof(ifindex))) goto unlock; ret = -EINVAL; if (ifindex < 0) goto unlock; ret = 0; tfile->ifindex = ifindex; goto unlock; } ret = -EBADFD; if (!tun) goto unlock; netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd); net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: tun_get_iff(tun, &ifr); if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; if (!tfile->socket.sk->sk_filter) ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case TUNSETNOCSUM: /* Disable/Enable checksum */ /* [unimplemented] */ netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n", arg ? "disabled" : "enabled"); break; case TUNSETPERSIST: /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ if (arg && !(tun->flags & IFF_PERSIST)) { tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); do_notify = true; } if (!arg && (tun->flags & IFF_PERSIST)) { tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); do_notify = true; } netif_info(tun, drv, tun->dev, "persist %s\n", arg ? "enabled" : "disabled"); break; case TUNSETOWNER: /* Set owner of the device */ owner = make_kuid(current_user_ns(), arg); if (!uid_valid(owner)) { ret = -EINVAL; break; } tun->owner = owner; do_notify = true; netif_info(tun, drv, tun->dev, "owner set to %u\n", from_kuid(&init_user_ns, tun->owner)); break; case TUNSETGROUP: /* Set group of the device */ group = make_kgid(current_user_ns(), arg); if (!gid_valid(group)) { ret = -EINVAL; break; } tun->group = group; do_notify = true; netif_info(tun, drv, tun->dev, "group set to %u\n", from_kgid(&init_user_ns, tun->group)); break; case TUNSETLINK: /* Only allow setting the type when the interface is down */ if (tun->dev->flags & IFF_UP) { netif_info(tun, drv, tun->dev, "Linktype set failed because interface is up\n"); ret = -EBUSY; } else { ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, tun->dev); ret = notifier_to_errno(ret); if (ret) { netif_info(tun, drv, tun->dev, "Refused to change device type\n"); break; } tun->dev->type = (int) arg; tun->dev->addr_len = tun_get_addr_len(tun->dev->type); netif_info(tun, drv, tun->dev, "linktype set to %d\n", tun->dev->type); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, tun->dev); } break; case TUNSETDEBUG: tun->msg_enable = (u32)arg; break; case TUNSETOFFLOAD: ret = set_offload(tun, arg); break; case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = update_filter(&tun->txflt, (void __user *)arg); break; case SIOCGIFHWADDR: /* Get hw address */ netif_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case SIOCSIFHWADDR: /* Set hw address */ if (tun->dev->addr_len > sizeof(ifr.ifr_hwaddr)) { ret = -EINVAL; break; } ret = dev_set_mac_address_user(tun->dev, (struct sockaddr_storage *)&ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: sndbuf = tfile->socket.sk->sk_sndbuf; if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) ret = -EFAULT; break; case TUNSETSNDBUF: if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) { ret = -EFAULT; break; } if (sndbuf <= 0) { ret = -EINVAL; break; } tun->sndbuf = sndbuf; tun_set_sndbuf(tun); break; case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) break; ret = tun_attach_filter(tun); break; case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = 0; tun_detach_filter(tun, tun->numqueues); break; case TUNGETFILTER: ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) break; ret = 0; break; case TUNSETSTEERINGEBPF: ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; case TUNSETFILTEREBPF: ret = tun_set_ebpf(tun, &tun->filter_prog, argp); break; case TUNSETCARRIER: ret = -EFAULT; if (copy_from_user(&carrier, argp, sizeof(carrier))) goto unlock; ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; case TUNGETDEVNETNS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto unlock; ret = open_related_ns(&net->ns, get_net_ns); break; default: ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); break; } if (do_notify) netdev_state_change(tun->dev); unlock: rtnl_unlock(); if (tun) tun_put(tun); return ret; } static long tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq)); } #ifdef CONFIG_COMPAT static long tun_chr_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case TUNSETIFF: case TUNGETIFF: case TUNSETTXFILTER: case TUNGETSNDBUF: case TUNSETSNDBUF: case SIOCGIFHWADDR: case SIOCSIFHWADDR: arg = (unsigned long)compat_ptr(arg); break; default: arg = (compat_ulong_t)arg; break; } /* * compat_ifreq is shorter than ifreq, so we must not access beyond * the end of that structure. All fields that are used in this * driver are compatible though, we don't need to convert the * contents. */ return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq)); } #endif /* CONFIG_COMPAT */ static int tun_chr_fasync(int fd, struct file *file, int on) { struct tun_file *tfile = file->private_data; int ret; if (on) { ret = file_f_owner_allocate(file); if (ret) goto out; } if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; if (on) { __f_setown(file, task_pid(current), PIDTYPE_TGID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; ret = 0; out: return ret; } static int tun_chr_open(struct inode *inode, struct file * file) { struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); if (!tfile) return -ENOMEM; if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) { sk_free(&tfile->sk); return -ENOMEM; } mutex_init(&tfile->napi_mutex); RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; init_waitqueue_head(&tfile->socket.wq.wait); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid()); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); /* tun groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; return 0; } static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; tun_detach(tfile, true); return 0; } #ifdef CONFIG_PROC_FS static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); rtnl_lock(); tun = tun_get(tfile); if (tun) tun_get_iff(tun, &ifr); rtnl_unlock(); if (tun) tun_put(tun); seq_printf(m, "iff:\t%s\n", ifr.ifr_name); } #endif static const struct file_operations tun_fops = { .owner = THIS_MODULE, .read_iter = tun_chr_read_iter, .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = tun_chr_compat_ioctl, #endif .open = tun_chr_open, .release = tun_chr_close, .fasync = tun_chr_fasync, #ifdef CONFIG_PROC_FS .show_fdinfo = tun_chr_show_fdinfo, #endif }; static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", .nodename = "net/tun", .fops = &tun_fops, }; /* ethtool interface */ static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_zero_link_mode(cmd, advertising); cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; } static int tun_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(cmd, &tun->link_ksettings, sizeof(*cmd)); return 0; } static int tun_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(&tun->link_ksettings, cmd, sizeof(*cmd)); return 0; } static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct tun_struct *tun = netdev_priv(dev); strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: strscpy(info->bus_info, "tun", sizeof(info->bus_info)); break; case IFF_TAP: strscpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } } static u32 tun_get_msglevel(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); return tun->msg_enable; } static void tun_set_msglevel(struct net_device *dev, u32 value) { struct tun_struct *tun = netdev_priv(dev); tun->msg_enable = value; } static int tun_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); ec->rx_max_coalesced_frames = tun->rx_batched; return 0; } static int tun_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT) tun->rx_batched = NAPI_POLL_WEIGHT; else tun->rx_batched = ec->rx_max_coalesced_frames; return 0; } static void tun_get_channels(struct net_device *dev, struct ethtool_channels *channels) { struct tun_struct *tun = netdev_priv(dev); channels->combined_count = tun->numqueues; channels->max_combined = tun->flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; } static const struct ethtool_ops tun_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_drvinfo = tun_get_drvinfo, .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, .get_channels = tun_get_channels, .get_ts_info = ethtool_op_get_ts_info, .get_coalesce = tun_get_coalesce, .set_coalesce = tun_set_coalesce, .get_link_ksettings = tun_get_link_ksettings, .set_link_ksettings = tun_set_link_ksettings, }; static int tun_queue_resize(struct tun_struct *tun) { struct net_device *dev = tun->dev; struct tun_file *tfile; struct ptr_ring **rings; int n = tun->numqueues + tun->numdisabled; int ret, i; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); rings[i] = &tfile->tx_ring; } list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; ret = ptr_ring_resize_multiple_bh(rings, n, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free); kfree(rings); return ret; } static int tun_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tun_struct *tun = netdev_priv(dev); int i; if (dev->rtnl_link_ops != &tun_link_ops) return NOTIFY_DONE; switch (event) { case NETDEV_CHANGE_TX_QUEUE_LEN: if (tun_queue_resize(tun)) return NOTIFY_BAD; break; case NETDEV_UP: for (i = 0; i < tun->numqueues; i++) { struct tun_file *tfile; tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_write_space(tfile->socket.sk); } break; default: break; } return NOTIFY_DONE; } static struct notifier_block tun_notifier_block __read_mostly = { .notifier_call = tun_device_event, }; static int __init tun_init(void) { int ret = 0; pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); ret = rtnl_link_register(&tun_link_ops); if (ret) { pr_err("Can't register link_ops\n"); goto err_linkops; } ret = misc_register(&tun_miscdev); if (ret) { pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } ret = register_netdevice_notifier(&tun_notifier_block); if (ret) { pr_err("Can't register netdevice notifier\n"); goto err_notifier; } return 0; err_notifier: misc_deregister(&tun_miscdev); err_misc: rtnl_link_unregister(&tun_link_ops); err_linkops: return ret; } static void __exit tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket); struct ptr_ring *tun_get_tx_ring(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->tx_ring; } EXPORT_SYMBOL_GPL(tun_get_tx_ring); module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); MODULE_ALIAS("devname:net/tun"); MODULE_IMPORT_NS("NETDEV_INTERNAL");
1191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 /* SPDX-License-Identifier: GPL-2.0 */ /* * Because linux/module.h has tracepoints in the header, and ftrace.h * used to include this file, define_trace.h includes linux/module.h * But we do not want the module.h to override the TRACE_SYSTEM macro * variable that define_trace.h is processing, so we only set it * when module events are being processed, which would happen when * CREATE_TRACE_POINTS is defined. */ #ifdef CREATE_TRACE_POINTS #undef TRACE_SYSTEM #define TRACE_SYSTEM module #endif #if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MODULE_H #include <linux/tracepoint.h> #ifdef CONFIG_MODULES struct module; #define show_module_flags(flags) __print_flags(flags, "", \ { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \ { (1UL << TAINT_OOT_MODULE), "O" }, \ { (1UL << TAINT_FORCED_MODULE), "F" }, \ { (1UL << TAINT_CRAP), "C" }, \ { (1UL << TAINT_UNSIGNED_MODULE), "E" }) TRACE_EVENT(module_load, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __field( unsigned int, taints ) __string( name, mod->name ) ), TP_fast_assign( __entry->taints = mod->taints; __assign_str(name); ), TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints)) ); TRACE_EVENT(module_free, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __string( name, mod->name ) ), TP_fast_assign( __assign_str(name); ), TP_printk("%s", __get_str(name)) ); #ifdef CONFIG_MODULE_UNLOAD /* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */ DECLARE_EVENT_CLASS(module_refcnt, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( int, refcnt ) __string( name, mod->name ) ), TP_fast_assign( __entry->ip = ip; __entry->refcnt = atomic_read(&mod->refcnt); __assign_str(name); ), TP_printk("%s call_site=%ps refcnt=%d", __get_str(name), (void *)__entry->ip, __entry->refcnt) ); DEFINE_EVENT(module_refcnt, module_get, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); DEFINE_EVENT(module_refcnt, module_put, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); #endif /* CONFIG_MODULE_UNLOAD */ TRACE_EVENT(module_request, TP_PROTO(char *name, bool wait, unsigned long ip), TP_ARGS(name, wait, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( bool, wait ) __string( name, name ) ), TP_fast_assign( __entry->ip = ip; __entry->wait = wait; __assign_str(name); ), TP_printk("%s wait=%d call_site=%ps", __get_str(name), (int)__entry->wait, (void *)__entry->ip) ); #endif /* CONFIG_MODULES */ #endif /* _TRACE_MODULE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
833 99 87 5 238 492 2554 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> #include <linux/mm.h> struct ctl_table_header; struct mempolicy; /* * This is not completely implemented yet. The idea is to * create an in-memory tree (like the actual /proc filesystem * tree) of these proc_dir_entries, so that we can dynamically * add new files to /proc. * * parent/subdir are used for the directory structure (every /proc file has a * parent, but "subdir" is empty for all non-directory entries). * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { /* * number of callers into module in progress; * negative -> it's going away RSN */ atomic_t in_use; refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; struct completion *pde_unload_completion; const struct inode_operations *proc_iops; union { const struct proc_ops *proc_ops; const struct file_operations *proc_dir_ops; }; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); }; proc_write_t write; void *data; unsigned int state_size; unsigned int low_ino; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; struct proc_dir_entry *parent; struct rb_root subdir; struct rb_node subdir_node; char *name; umode_t mode; u8 flags; u8 namelen; char inline_name[]; } __randomize_layout; #define SIZEOF_PDE ( \ sizeof(struct proc_dir_entry) < 128 ? 128 : \ sizeof(struct proc_dir_entry) < 192 ? 192 : \ sizeof(struct proc_dir_entry) < 256 ? 256 : \ sizeof(struct proc_dir_entry) < 512 ? 512 : \ 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) static inline bool pde_is_permanent(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_PERMANENT; } static inline void pde_make_permanent(struct proc_dir_entry *pde) { pde->flags |= PROC_ENTRY_PERMANENT; } static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_proc_read_iter; } static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde) { #ifdef CONFIG_COMPAT return pde->flags & PROC_ENTRY_proc_compat_ioctl; #else return false; #endif } static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_proc_lseek; } extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); int lsmid; }; struct proc_inode { struct pid *pid; unsigned int fd; union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; const struct ctl_table *sysctl_entry; struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; /* * General functions */ static inline struct proc_inode *PROC_I(const struct inode *inode) { return container_of(inode, struct proc_inode, vfs_inode); } static inline struct proc_dir_entry *PDE(const struct inode *inode) { return PROC_I(inode)->pde; } static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } static inline struct task_struct *get_proc_task(const struct inode *inode) { return get_pid_task(proc_pid(inode), PIDTYPE_PID); } void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ #define FIRST_PROCESS_ENTRY 256 /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 #ifdef CONFIG_PAGE_MAPCOUNT /** * folio_precise_page_mapcount() - Number of mappings of this folio page. * @folio: The folio. * @page: The page. * * The number of present user page table entries that reference this page * as tracked via the RMAP: either referenced directly (PTE) or as part of * a larger area that covers this page (e.g., PMD). * * Use this function only for the calculation of existing statistics * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount). * * Do not add new users. * * Returns: The number of mappings of this folio page. 0 for * folios that are not mapped to user space or are not tracked via the RMAP * (e.g., shared zeropage). */ static inline int folio_precise_page_mapcount(struct folio *folio, struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; if (page_mapcount_is_type(mapcount)) mapcount = 0; if (folio_test_large(folio)) mapcount += folio_entire_mapcount(folio); return mapcount; } #else /* !CONFIG_PAGE_MAPCOUNT */ static inline int folio_precise_page_mapcount(struct folio *folio, struct page *page) { BUILD_BUG(); } #endif /* CONFIG_PAGE_MAPCOUNT */ /** * folio_average_page_mapcount() - Average number of mappings per page in this * folio * @folio: The folio. * * The average number of user page table entries that reference each page in * this folio as tracked via the RMAP: either referenced directly (PTE) or * as part of a larger area that covers this page (e.g., PMD). * * The average is calculated by rounding to the nearest integer; however, * to avoid duplicated code in current callers, the average is at least * 1 if any page of the folio is mapped. * * Returns: The average number of mappings per page in this folio. */ static inline int folio_average_page_mapcount(struct folio *folio) { int mapcount, entire_mapcount, avg; if (!folio_test_large(folio)) return atomic_read(&folio->_mapcount) + 1; mapcount = folio_large_mapcount(folio); if (unlikely(mapcount <= 0)) return 0; entire_mapcount = folio_entire_mapcount(folio); if (mapcount <= entire_mapcount) return entire_mapcount; mapcount -= entire_mapcount; /* Round to closest integer ... */ avg = ((unsigned int)mapcount + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio); /* ... but return at least 1. */ return max_t(int, avg + entire_mapcount, 1); } /* * array.c */ extern const struct file_operations proc_tid_children_operations; extern void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_status(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); /* * base.c */ extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* * generic.c */ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry **parent, void *data); struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline void pde_get(struct proc_dir_entry *pde) { refcount_inc(&pde->refcnt); } extern void pde_put(struct proc_dir_entry *); static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); /* * inode.c */ struct pde_opener { struct list_head lh; struct file *file; bool closing; struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); /* * proc_namespaces.c */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; /* * proc_net.c */ extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; #ifdef CONFIG_NET extern int proc_net_init(void); #else static inline int proc_net_init(void) { return 0; } #endif /* * proc_self.c */ extern int proc_setup_self(struct super_block *); /* * proc_thread_self.c */ extern int proc_setup_thread_self(struct super_block *); extern void proc_thread_self_init(void); /* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } static inline void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { } #endif /* * proc_tty.c */ #ifdef CONFIG_TTY extern void proc_tty_init(void); #else static inline void proc_tty_init(void) {} #endif /* * root.c */ extern struct proc_dir_entry proc_root; extern void proc_self_init(void); /* * task_[no]mmu.c */ struct mem_size_stats; struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; struct vma_iterator iter; loff_t last_pos; #ifdef CONFIG_PER_VMA_LOCK bool mmap_locked; struct vm_area_struct *locked_vma; #endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif } __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); extern const struct dentry_operations proc_net_dentry_ops; static inline void pde_force_lookup(struct proc_dir_entry *pde) { /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ pde->flags |= PROC_ENTRY_FORCE_LOOKUP; } /* * Add a new procfs dentry that can't serve as a mountpoint. That should * encompass anything that is ephemeral and can just disappear while the * process is still around. */ static inline struct dentry *proc_splice_unmountable(struct inode *inode, struct dentry *dentry, const struct dentry_operations *d_ops) { dont_mount(dentry); return d_splice_alias_ops(inode, dentry, d_ops); }
16 12 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __PACKET_INTERNAL_H__ #define __PACKET_INTERNAL_H__ #include <linux/refcount.h> struct packet_mclist { struct packet_mclist *next; int ifindex; int count; unsigned short type; unsigned short alen; unsigned char addr[MAX_ADDR_LEN]; struct list_head remove_list; }; /* kbdq - kernel block descriptor queue */ struct tpacket_kbdq_core { struct pgv *pkbdq; unsigned int feature_req_word; unsigned int hdrlen; unsigned char reset_pending_on_curr_blk; unsigned char delete_blk_timer; unsigned short kactive_blk_num; unsigned short blk_sizeof_priv; /* last_kactive_blk_num: * trick to see if user-space has caught up * in order to avoid refreshing timer when every single pkt arrives. */ unsigned short last_kactive_blk_num; char *pkblk_start; char *pkblk_end; int kblk_size; unsigned int max_frame_len; unsigned int knum_blocks; uint64_t knxt_seq_num; char *prev; char *nxt_offset; struct sk_buff *skb; rwlock_t blk_fill_in_prog_lock; /* Default is set to 8ms */ #define DEFAULT_PRB_RETIRE_TOV (8) unsigned short retire_blk_tov; unsigned short version; unsigned long tov_in_jiffies; /* timer to retire an outstanding block */ struct timer_list retire_blk_timer; }; struct pgv { char *buffer; }; struct packet_ring_buffer { struct pgv *pg_vec; unsigned int head; unsigned int frames_per_block; unsigned int frame_size; unsigned int frame_max; unsigned int pg_vec_order; unsigned int pg_vec_pages; unsigned int pg_vec_len; unsigned int __percpu *pending_refcnt; union { unsigned long *rx_owner_map; struct tpacket_kbdq_core prb_bdqc; }; }; extern struct mutex fanout_mutex; #define PACKET_FANOUT_MAX (1 << 16) struct packet_fanout { possible_net_t net; unsigned int num_members; u32 max_num_members; u16 id; u8 type; u8 flags; union { atomic_t rr_cur; struct bpf_prog __rcu *bpf_prog; }; struct list_head list; spinlock_t lock; refcount_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; struct sock __rcu *arr[] __counted_by(max_num_members); }; struct packet_rollover { int sock; atomic_long_t num; atomic_long_t num_huge; atomic_long_t num_failed; #define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32)) u32 history[ROLLOVER_HLEN] ____cacheline_aligned; } ____cacheline_aligned_in_smp; struct packet_sock { /* struct sock has to be the first member of packet_sock */ struct sock sk; struct packet_fanout *fanout; union tpacket_stats_u stats; struct packet_ring_buffer rx_ring; struct packet_ring_buffer tx_ring; int copy_thresh; spinlock_t bind_lock; struct mutex pg_vec_lock; unsigned long flags; int ifindex; /* bound device */ u8 vnet_hdr_sz; __be16 num; struct packet_rollover *rollover; struct packet_mclist *mclist; atomic_long_t mapped; enum tpacket_versions tp_version; unsigned int tp_hdrlen; unsigned int tp_reserve; unsigned int tp_tstamp; struct completion skb_completion; struct net_device __rcu *cached_dev; struct packet_type prot_hook ____cacheline_aligned_in_smp; atomic_t tp_drops ____cacheline_aligned_in_smp; }; #define pkt_sk(ptr) container_of_const(ptr, struct packet_sock, sk) enum packet_sock_flags { PACKET_SOCK_ORIGDEV, PACKET_SOCK_AUXDATA, PACKET_SOCK_TX_HAS_OFF, PACKET_SOCK_TP_LOSS, PACKET_SOCK_RUNNING, PACKET_SOCK_PRESSURE, PACKET_SOCK_QDISC_BYPASS, }; static inline void packet_sock_flag_set(struct packet_sock *po, enum packet_sock_flags flag, bool val) { if (val) set_bit(flag, &po->flags); else clear_bit(flag, &po->flags); } static inline bool packet_sock_flag(const struct packet_sock *po, enum packet_sock_flags flag) { return test_bit(flag, &po->flags); } #endif
4 1 17 17 17 70 70 70 38 38 85 3 1 1 1 2 7 1 6 5 3 2 4 6 1 1 3 3 3 5 5 1 57 3 15 52 54 10 16 55 55 54 125 125 6 124 70 71 11 82 82 82 2 14 3 1 18 1 15 2 17 18 18 18 1 1 3 5 5 5 65 62 63 65 65 3 3 3 2 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Timer * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> */ #include <sound/core.h> #include <linux/slab.h> #include "seq_timer.h" #include "seq_queue.h" #include "seq_info.h" /* allowed sequencer timer frequencies, in Hz */ #define MIN_FREQUENCY 10 #define MAX_FREQUENCY 6250 #define DEFAULT_FREQUENCY 1000 #define SKEW_BASE 0x10000 /* 16bit shift */ static void snd_seq_timer_set_tick_resolution(struct snd_seq_timer *tmr) { unsigned int threshold = tmr->tempo_base == 1000 ? 1000000 : 10000; if (tmr->tempo < threshold) tmr->tick.resolution = (tmr->tempo * tmr->tempo_base) / tmr->ppq; else { /* might overflow.. */ unsigned int s; s = tmr->tempo % tmr->ppq; s = (s * tmr->tempo_base) / tmr->ppq; tmr->tick.resolution = (tmr->tempo / tmr->ppq) * tmr->tempo_base; tmr->tick.resolution += s; } if (tmr->tick.resolution <= 0) tmr->tick.resolution = 1; snd_seq_timer_update_tick(&tmr->tick, 0); } /* create new timer (constructor) */ struct snd_seq_timer *snd_seq_timer_new(void) { struct snd_seq_timer *tmr; tmr = kzalloc(sizeof(*tmr), GFP_KERNEL); if (!tmr) return NULL; spin_lock_init(&tmr->lock); /* reset setup to defaults */ snd_seq_timer_defaults(tmr); /* reset time */ snd_seq_timer_reset(tmr); return tmr; } /* delete timer (destructor) */ void snd_seq_timer_delete(struct snd_seq_timer **tmr) { struct snd_seq_timer *t = *tmr; *tmr = NULL; if (t == NULL) { pr_debug("ALSA: seq: snd_seq_timer_delete() called with NULL timer\n"); return; } t->running = 0; /* reset time */ snd_seq_timer_stop(t); snd_seq_timer_reset(t); kfree(t); } void snd_seq_timer_defaults(struct snd_seq_timer * tmr) { guard(spinlock_irqsave)(&tmr->lock); /* setup defaults */ tmr->ppq = 96; /* 96 PPQ */ tmr->tempo = 500000; /* 120 BPM */ tmr->tempo_base = 1000; /* 1us */ snd_seq_timer_set_tick_resolution(tmr); tmr->running = 0; tmr->type = SNDRV_SEQ_TIMER_ALSA; tmr->alsa_id.dev_class = seq_default_timer_class; tmr->alsa_id.dev_sclass = seq_default_timer_sclass; tmr->alsa_id.card = seq_default_timer_card; tmr->alsa_id.device = seq_default_timer_device; tmr->alsa_id.subdevice = seq_default_timer_subdevice; tmr->preferred_resolution = seq_default_timer_resolution; tmr->skew = tmr->skew_base = SKEW_BASE; } static void seq_timer_reset(struct snd_seq_timer *tmr) { /* reset time & songposition */ tmr->cur_time.tv_sec = 0; tmr->cur_time.tv_nsec = 0; tmr->tick.cur_tick = 0; tmr->tick.fraction = 0; } void snd_seq_timer_reset(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); seq_timer_reset(tmr); } /* called by timer interrupt routine. the period time since previous invocation is passed */ static void snd_seq_timer_interrupt(struct snd_timer_instance *timeri, unsigned long resolution, unsigned long ticks) { struct snd_seq_queue *q = timeri->callback_data; struct snd_seq_timer *tmr; if (q == NULL) return; tmr = q->timer; if (tmr == NULL) return; scoped_guard(spinlock_irqsave, &tmr->lock) { if (!tmr->running) return; resolution *= ticks; if (tmr->skew != tmr->skew_base) { /* FIXME: assuming skew_base = 0x10000 */ resolution = (resolution >> 16) * tmr->skew + (((resolution & 0xffff) * tmr->skew) >> 16); } /* update timer */ snd_seq_inc_time_nsec(&tmr->cur_time, resolution); /* calculate current tick */ snd_seq_timer_update_tick(&tmr->tick, resolution); /* register actual time of this timer update */ ktime_get_ts64(&tmr->last_update); } /* check queues and dispatch events */ snd_seq_check_queue(q, 1, 0); } /* set current tempo */ int snd_seq_timer_set_tempo(struct snd_seq_timer * tmr, int tempo) { if (snd_BUG_ON(!tmr)) return -EINVAL; if (tempo <= 0) return -EINVAL; guard(spinlock_irqsave)(&tmr->lock); if ((unsigned int)tempo != tmr->tempo) { tmr->tempo = tempo; snd_seq_timer_set_tick_resolution(tmr); } return 0; } /* set current tempo, ppq and base in a shot */ int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq, unsigned int tempo_base) { int changed; if (snd_BUG_ON(!tmr)) return -EINVAL; if (tempo <= 0 || ppq <= 0) return -EINVAL; /* allow only 10ns or 1us tempo base for now */ if (tempo_base && tempo_base != 10 && tempo_base != 1000) return -EINVAL; guard(spinlock_irqsave)(&tmr->lock); if (tmr->running && (ppq != tmr->ppq)) { /* refuse to change ppq on running timers */ /* because it will upset the song position (ticks) */ pr_debug("ALSA: seq: cannot change ppq of a running timer\n"); return -EBUSY; } changed = (tempo != tmr->tempo) || (ppq != tmr->ppq); tmr->tempo = tempo; tmr->ppq = ppq; tmr->tempo_base = tempo_base ? tempo_base : 1000; if (changed) snd_seq_timer_set_tick_resolution(tmr); return 0; } /* set current tick position */ int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position) { if (snd_BUG_ON(!tmr)) return -EINVAL; guard(spinlock_irqsave)(&tmr->lock); tmr->tick.cur_tick = position; tmr->tick.fraction = 0; return 0; } /* set current real-time position */ int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position) { if (snd_BUG_ON(!tmr)) return -EINVAL; snd_seq_sanity_real_time(&position); guard(spinlock_irqsave)(&tmr->lock); tmr->cur_time = position; return 0; } /* set timer skew */ int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base) { if (snd_BUG_ON(!tmr)) return -EINVAL; /* FIXME */ if (base != SKEW_BASE) { pr_debug("ALSA: seq: invalid skew base 0x%x\n", base); return -EINVAL; } guard(spinlock_irqsave)(&tmr->lock); tmr->skew = skew; return 0; } int snd_seq_timer_open(struct snd_seq_queue *q) { struct snd_timer_instance *t; struct snd_seq_timer *tmr; char str[32]; int err; tmr = q->timer; if (snd_BUG_ON(!tmr)) return -EINVAL; if (tmr->timeri) return -EBUSY; sprintf(str, "sequencer queue %i", q->queue); if (tmr->type != SNDRV_SEQ_TIMER_ALSA) /* standard ALSA timer */ return -EINVAL; if (tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_SLAVE) tmr->alsa_id.dev_sclass = SNDRV_TIMER_SCLASS_SEQUENCER; t = snd_timer_instance_new(str); if (!t) return -ENOMEM; t->callback = snd_seq_timer_interrupt; t->callback_data = q; t->flags |= SNDRV_TIMER_IFLG_AUTO; err = snd_timer_open(t, &tmr->alsa_id, q->queue); if (err < 0 && tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_SLAVE) { if (tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_GLOBAL || tmr->alsa_id.device != SNDRV_TIMER_GLOBAL_SYSTEM) { struct snd_timer_id tid; memset(&tid, 0, sizeof(tid)); tid.dev_class = SNDRV_TIMER_CLASS_GLOBAL; tid.dev_sclass = SNDRV_TIMER_SCLASS_SEQUENCER; tid.card = -1; tid.device = SNDRV_TIMER_GLOBAL_SYSTEM; err = snd_timer_open(t, &tid, q->queue); } } if (err < 0) { pr_err("ALSA: seq fatal error: cannot create timer (%i)\n", err); snd_timer_instance_free(t); return err; } scoped_guard(spinlock_irq, &tmr->lock) { if (tmr->timeri) err = -EBUSY; else tmr->timeri = t; } if (err < 0) { snd_timer_close(t); snd_timer_instance_free(t); return err; } return 0; } int snd_seq_timer_close(struct snd_seq_queue *q) { struct snd_seq_timer *tmr; struct snd_timer_instance *t; tmr = q->timer; if (snd_BUG_ON(!tmr)) return -EINVAL; scoped_guard(spinlock_irq, &tmr->lock) { t = tmr->timeri; tmr->timeri = NULL; } if (t) { snd_timer_close(t); snd_timer_instance_free(t); } return 0; } static int seq_timer_stop(struct snd_seq_timer *tmr) { if (! tmr->timeri) return -EINVAL; if (!tmr->running) return 0; tmr->running = 0; snd_timer_pause(tmr->timeri); return 0; } int snd_seq_timer_stop(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return seq_timer_stop(tmr); } static int initialize_timer(struct snd_seq_timer *tmr) { struct snd_timer *t; unsigned long freq; t = tmr->timeri->timer; if (!t) return -EINVAL; freq = tmr->preferred_resolution; if (!freq) freq = DEFAULT_FREQUENCY; else if (freq < MIN_FREQUENCY) freq = MIN_FREQUENCY; else if (freq > MAX_FREQUENCY) freq = MAX_FREQUENCY; tmr->ticks = 1; if (!(t->hw.flags & SNDRV_TIMER_HW_SLAVE)) { unsigned long r = snd_timer_resolution(tmr->timeri); if (r) { tmr->ticks = (unsigned int)(1000000000uL / (r * freq)); if (! tmr->ticks) tmr->ticks = 1; } } tmr->initialized = 1; return 0; } static int seq_timer_start(struct snd_seq_timer *tmr) { if (! tmr->timeri) return -EINVAL; if (tmr->running) seq_timer_stop(tmr); seq_timer_reset(tmr); if (initialize_timer(tmr) < 0) return -EINVAL; snd_timer_start(tmr->timeri, tmr->ticks); tmr->running = 1; ktime_get_ts64(&tmr->last_update); return 0; } int snd_seq_timer_start(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return seq_timer_start(tmr); } static int seq_timer_continue(struct snd_seq_timer *tmr) { if (! tmr->timeri) return -EINVAL; if (tmr->running) return -EBUSY; if (! tmr->initialized) { seq_timer_reset(tmr); if (initialize_timer(tmr) < 0) return -EINVAL; } snd_timer_start(tmr->timeri, tmr->ticks); tmr->running = 1; ktime_get_ts64(&tmr->last_update); return 0; } int snd_seq_timer_continue(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return seq_timer_continue(tmr); } /* return current 'real' time. use timeofday() to get better granularity. */ snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr, bool adjust_ktime) { snd_seq_real_time_t cur_time; guard(spinlock_irqsave)(&tmr->lock); cur_time = tmr->cur_time; if (adjust_ktime && tmr->running) { struct timespec64 tm; ktime_get_ts64(&tm); tm = timespec64_sub(tm, tmr->last_update); cur_time.tv_nsec += tm.tv_nsec; cur_time.tv_sec += tm.tv_sec; snd_seq_sanity_real_time(&cur_time); } return cur_time; } /* TODO: use interpolation on tick queue (will only be useful for very high PPQ values) */ snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return tmr->tick.cur_tick; } #ifdef CONFIG_SND_PROC_FS /* exported to seq_info.c */ void snd_seq_info_timer_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int idx; struct snd_seq_queue *q; struct snd_seq_timer *tmr; struct snd_timer_instance *ti; unsigned long resolution; for (idx = 0; idx < SNDRV_SEQ_MAX_QUEUES; idx++) { q = queueptr(idx); if (q == NULL) continue; scoped_guard(mutex, &q->timer_mutex) { tmr = q->timer; if (!tmr) break; ti = tmr->timeri; if (!ti) break; snd_iprintf(buffer, "Timer for queue %i : %s\n", q->queue, ti->timer->name); resolution = snd_timer_resolution(ti) * tmr->ticks; snd_iprintf(buffer, " Period time : %lu.%09lu\n", resolution / 1000000000, resolution % 1000000000); snd_iprintf(buffer, " Skew : %u / %u\n", tmr->skew, tmr->skew_base); } queuefree(q); } } #endif /* CONFIG_SND_PROC_FS */
70 70 492 492 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/inode.c * * Copyright (C) 1992 Rick Sladkey * * nfs inode and superblock handling functions * * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some * experimental NFS changes. Modularisation taken straight from SYS5 fs. * * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. * J.S.Peatfield@damtp.cam.ac.uk * */ #include <linux/module.h> #include <linux/init.h> #include <linux/sched/signal.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/nfs_xdr.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/freezer.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" #include "sysfs.h" #include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_VFS #define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 /* Default is to see 64-bit inode numbers */ static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; static int nfs_update_inode(struct inode *, struct nfs_fattr *); static struct kmem_cache * nfs_inode_cachep; static inline unsigned long nfs_fattr_to_ino_t(struct nfs_fattr *fattr) { return nfs_fileid_to_ino_t(fattr->fileid); } int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { if (unlikely(nfs_current_task_exiting())) return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid * * This function returns a 32-bit inode number if the boot parameter * nfs.enable_ino64 is zero. */ u64 nfs_compat_user_ino64(u64 fileid) { #ifdef CONFIG_COMPAT compat_ulong_t ino; #else unsigned long ino; #endif if (enable_ino64) return fileid; ino = fileid; if (sizeof(ino) < sizeof(fileid)) ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; return ino; } int nfs_drop_inode(struct inode *inode) { return NFS_STALE(inode) || generic_drop_inode(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); void nfs_clear_inode(struct inode *inode) { /* * The following should never happen... */ WARN_ON_ONCE(nfs_have_writebacks(inode)); WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); nfs_fscache_clear_inode(inode); } EXPORT_SYMBOL_GPL(nfs_clear_inode); void nfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nfs_clear_inode(inode); } int nfs_sync_inode(struct inode *inode) { inode_dio_wait(inode); return nfs_wb_all(inode); } EXPORT_SYMBOL_GPL(nfs_sync_inode); /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk * @mapping: pointer to struct address_space */ int nfs_sync_mapping(struct address_space *mapping) { int ret = 0; if (mapping->nrpages != 0) { unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_wb_all(mapping->host); } return ret; } static int nfs_attribute_timeout(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } static bool nfs_check_cache_flags_invalid(struct inode *inode, unsigned long flags) { unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); return (cache_validity & flags) != 0; } bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) { if (nfs_check_cache_flags_invalid(inode, flags)) return true; return nfs_attribute_cache_expired(inode); } EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); #ifdef CONFIG_NFS_V4_2 static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) { return nfsi->xattr_cache != NULL; } #else static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) { return false; } #endif void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) { if (!(flags & NFS_INO_REVAL_FORCED)) flags &= ~(NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_BTIME | NFS_INO_INVALID_XATTR); flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); } if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode, 0); flags &= ~NFS_INO_REVAL_FORCED; flags |= nfsi->cache_validity; if (inode->i_mapping->nrpages == 0) flags &= ~NFS_INO_INVALID_DATA; /* pairs with nfs_clear_invalid_mapping()'s smp_load_acquire() */ smp_store_release(&nfsi->cache_validity, flags); if (inode->i_mapping->nrpages == 0 || nfsi->cache_validity & NFS_INO_INVALID_DATA) { nfs_ooo_clear(nfsi); } trace_nfs_set_cache_invalid(inode, 0); } EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); /* * Invalidate the local caches */ static void nfs_zap_caches_locked(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); int mode = inode->i_mode; nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR); nfs_zap_label_cache_locked(nfsi); } void nfs_zap_caches(struct inode *inode) { spin_lock(&inode->i_lock); nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { if (mapping->nrpages != 0) { spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); spin_unlock(&inode->i_lock); } } void nfs_zap_acl_cache(struct inode *inode) { void (*clear_acl_cache)(struct inode *); clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache; if (clear_acl_cache != NULL) clear_acl_cache(inode); spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { if (nfs_have_delegated_atime(inode)) return; spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_invalidate_atime); /* * Invalidate, but do not unhash, the inode. * NB: must be called with inode->i_lock held! */ static void nfs_set_inode_stale_locked(struct inode *inode) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); nfs_zap_caches_locked(inode); trace_nfs_set_inode_stale(inode); } void nfs_set_inode_stale(struct inode *inode) { spin_lock(&inode->i_lock); nfs_set_inode_stale_locked(inode); spin_unlock(&inode->i_lock); } struct nfs_find_desc { struct nfs_fh *fh; struct nfs_fattr *fattr; }; /* * In NFSv3 we can have 64bit inode numbers. In order to support * this, and re-exported directories (also seen in NFSv2) * we are forced to allow 2 different inodes to have the same * i_ino. */ static int nfs_find_actor(struct inode *inode, void *opaque) { struct nfs_find_desc *desc = opaque; struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; if (NFS_FILEID(inode) != fattr->fileid) return 0; if (inode_wrong_type(inode, fattr->mode)) return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; if (is_bad_inode(inode) || NFS_STALE(inode)) return 0; return 1; } static int nfs_init_locked(struct inode *inode, void *opaque) { struct nfs_find_desc *desc = opaque; struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); inode->i_mode = fattr->mode; nfs_copy_fh(NFS_FH(inode), desc->fh); return 0; } #ifdef CONFIG_NFS_V4_SECURITY_LABEL static void nfs_clear_label_invalid(struct inode *inode) { spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL; spin_unlock(&inode->i_lock); } void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr) { int error; if (fattr->label == NULL) return; if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { error = security_inode_notifysecctx(inode, fattr->label->label, fattr->label->len); if (error) printk(KERN_ERR "%s() %s %d " "security_inode_notifysecctx() %d\n", __func__, (char *)fattr->label->label, fattr->label->len, error); nfs_clear_label_invalid(inode); } } struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { struct nfs4_label *label; if (!(server->caps & NFS_CAP_SECURITY_LABEL)) return NULL; label = kzalloc(sizeof(struct nfs4_label), flags); if (label == NULL) return ERR_PTR(-ENOMEM); label->label = kzalloc(NFS4_MAXLABELLEN, flags); if (label->label == NULL) { kfree(label); return ERR_PTR(-ENOMEM); } label->len = NFS4_MAXLABELLEN; return label; } EXPORT_SYMBOL_GPL(nfs4_label_alloc); #else void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr) { } #endif EXPORT_SYMBOL_GPL(nfs_setsecurity); /* Search for inode identified by fh, fileid and i_mode in inode cache. */ struct inode * nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh) { struct nfs_find_desc desc = { .fh = fh, .fattr = fattr, }; struct inode *inode; unsigned long hash; if (!(fattr->valid & NFS_ATTR_FATTR_FILEID) || !(fattr->valid & NFS_ATTR_FATTR_TYPE)) return NULL; hash = nfs_fattr_to_ino_t(fattr); inode = ilookup5(sb, hash, nfs_find_actor, &desc); dprintk("%s: returning %p\n", __func__, inode); return inode; } static void nfs_inode_init_regular(struct nfs_inode *nfsi) { atomic_long_set(&nfsi->nrequests, 0); atomic_long_set(&nfsi->redirtied_pages, 0); INIT_LIST_HEAD(&nfsi->commit_info.list); atomic_long_set(&nfsi->commit_info.ncommit, 0); atomic_set(&nfsi->commit_info.rpcs_out, 0); mutex_init(&nfsi->commit_mutex); } static void nfs_inode_init_dir(struct nfs_inode *nfsi) { nfsi->cache_change_attribute = 0; memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); init_rwsem(&nfsi->rmdir_sem); } /* * This is our front-end to iget that looks up inodes by file handle * instead of inode number. */ struct inode * nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) { struct nfs_find_desc desc = { .fh = fh, .fattr = fattr }; struct inode *inode = ERR_PTR(-ENOENT); u64 fattr_supported = NFS_SB(sb)->fattr_valid; unsigned long hash; nfs_attr_check_mountpoint(sb, fattr); if (nfs_attr_use_mounted_on_fileid(fattr)) fattr->fileid = fattr->mounted_on_fileid; else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; hash = nfs_fattr_to_ino_t(fattr); inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc); if (inode == NULL) { inode = ERR_PTR(-ENOMEM); goto out_no_inode; } if (inode->i_state & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; /* We set i_ino for the few things that still rely on it, * such as stat(2) */ inode->i_ino = hash; /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; nfsi->cache_validity = 0; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && (fattr_supported & NFS_ATTR_FATTR_MODE)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; nfs_inode_init_regular(nfsi); mapping_set_large_folios(inode->i_mapping); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; inode->i_data.a_ops = &nfs_dir_aops; nfs_inode_init_dir(nfsi); /* Deal with crossing mountpoints */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else inode->i_op = &nfs_mountpoint_inode_operations; inode->i_fop = NULL; inode->i_flags |= S_AUTOMOUNT; } } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nfs_symlink_inode_operations; inode_nohighmem(inode); } else init_special_inode(inode, inode->i_mode, fattr->rdev); inode_set_atime(inode, 0, 0); inode_set_mtime(inode, 0, 0); inode_set_ctime(inode, 0, 0); memset(&nfsi->btime, 0, sizeof(nfsi->btime)); inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); inode->i_uid = make_kuid(&init_user_ns, -2); inode->i_gid = make_kgid(&init_user_ns, -2); inode->i_blocks = 0; nfsi->write_io = 0; nfsi->read_io = 0; nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_BTIME) nfsi->btime = fattr->btime; else if (fattr_supported & NFS_ATTR_FATTR_BTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_BTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); else set_nlink(inode, 1); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED && fattr->size != 0) nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED && fattr->size != 0) nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); nfs_setsecurity(inode, fattr); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; nfs_fscache_init_inode(inode); unlock_new_inode(inode); } else { int err = nfs_refresh_inode(inode, fattr); if (err < 0) { iput(inode); inode = ERR_PTR(err); goto out_no_inode; } } dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), atomic_read(&inode->i_count)); out: return inode; out_no_inode: dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode)); goto out; } EXPORT_SYMBOL_GPL(nfs_fhget); static void nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) { unsigned long cache_validity = NFS_I(inode)->cache_validity; if (nfs_have_delegated_mtime(inode)) { if (!(cache_validity & NFS_INO_INVALID_CTIME)) fattr->valid &= ~(NFS_ATTR_FATTR_PRECTIME | NFS_ATTR_FATTR_CTIME); if (!(cache_validity & NFS_INO_INVALID_MTIME)) fattr->valid &= ~(NFS_ATTR_FATTR_PREMTIME | NFS_ATTR_FATTR_MTIME); if (!(cache_validity & NFS_INO_INVALID_ATIME)) fattr->valid &= ~NFS_ATTR_FATTR_ATIME; } else if (nfs_have_delegated_atime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) fattr->valid &= ~NFS_ATTR_FATTR_ATIME; } } static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr) { unsigned int cache_flags = 0; if (attr->ia_valid & ATTR_MTIME_SET) { struct timespec64 ctime = inode_get_ctime(inode); struct timespec64 mtime = inode_get_mtime(inode); struct timespec64 now; int updated = 0; now = inode_set_ctime_current(inode); if (!timespec64_equal(&now, &ctime)) updated |= S_CTIME; inode_set_mtime_to_ts(inode, attr->ia_mtime); if (!timespec64_equal(&now, &mtime)) updated |= S_MTIME; inode_maybe_inc_iversion(inode, updated); cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; } if (attr->ia_valid & ATTR_ATIME_SET) { inode_set_atime_to_ts(inode, attr->ia_atime); cache_flags |= NFS_INO_INVALID_ATIME; } NFS_I(inode)->cache_validity &= ~cache_flags; } static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) { enum file_time_flags time_flags = 0; unsigned int cache_flags = 0; if (ia_valid & ATTR_MTIME) { time_flags |= S_MTIME | S_CTIME; cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; } if (ia_valid & ATTR_ATIME) { time_flags |= S_ATIME; cache_flags |= NFS_INO_INVALID_ATIME; } inode_update_timestamps(inode, time_flags); NFS_I(inode)->cache_validity &= ~cache_flags; } void nfs_update_delegated_atime(struct inode *inode) { spin_lock(&inode->i_lock); if (nfs_have_delegated_atime(inode)) nfs_update_timestamps(inode, ATTR_ATIME); spin_unlock(&inode->i_lock); } void nfs_update_delegated_mtime_locked(struct inode *inode) { if (nfs_have_delegated_mtime(inode)) nfs_update_timestamps(inode, ATTR_MTIME); } void nfs_update_delegated_mtime(struct inode *inode) { spin_lock(&inode->i_lock); nfs_update_delegated_mtime_locked(inode); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_update_delegated_mtime); #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); /* skip mode change if it's just for clearing setuid/setgid */ if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { BUG_ON(!S_ISREG(inode->i_mode)); error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; if (attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; } if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); if (attr->ia_valid & ATTR_MTIME_SET) { nfs_set_timestamps_to_ts(inode, attr); attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| ATTR_ATIME|ATTR_ATIME_SET); } else { nfs_update_timestamps(inode, attr->ia_valid); attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); } spin_unlock(&inode->i_lock); } else if (nfs_have_delegated_atime(inode) && attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { if (attr->ia_valid & ATTR_ATIME_SET) { spin_lock(&inode->i_lock); nfs_set_timestamps_to_ts(inode, attr); spin_unlock(&inode->i_lock); attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); } else { nfs_update_delegated_atime(inode); attr->ia_valid &= ~ATTR_ATIME; } } /* Optimization: if the end result is no change, don't RPC */ if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; trace_nfs_setattr_enter(inode); /* Write all dirty data */ if (S_ISREG(inode->i_mode)) nfs_sync_inode(inode); fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { error = -ENOMEM; goto out; } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) error = nfs_refresh_inode(inode, fattr); nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); return error; } EXPORT_SYMBOL_GPL(nfs_setattr); /** * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used * @offset: file offset to start truncating * * This is a copy of the common vmtruncate, but with the locking * corrected to take into account the fact that NFS requires * inode->i_size to be updated under the inode->i_lock. * Note: must be called with inode->i_lock held! */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { int err; err = inode_newsize_ok(inode, offset); if (err) goto out; trace_nfs_size_truncate(inode, offset); i_size_write(inode, offset); /* Optimisation */ if (offset == 0) { NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; nfs_ooo_clear(NFS_I(inode)); } NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); nfs_update_delegated_mtime_locked(inode); spin_lock(&inode->i_lock); out: return err; } /** * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode * @attr: pointer to struct iattr * @fattr: pointer to struct nfs_fattr * * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. */ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *fattr) { /* Barrier: bump the attribute generation count. */ nfs_fattr_set_barrier(fattr); spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & ATTR_SIZE) != 0) { if (!nfs_have_delegated_mtime(inode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME; if ((attr->ia_valid & ATTR_KILL_SUID) != 0 && inode->i_mode & S_ISUID) inode->i_mode &= ~S_ISUID; if (setattr_should_drop_sgid(&nop_mnt_idmap, inode)) inode->i_mode &= ~S_ISGID; if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } if ((attr->ia_valid & ATTR_UID) != 0) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); } if (attr->ia_valid & (ATTR_ATIME_SET|ATTR_ATIME)) { NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode_set_atime_to_ts(inode, fattr->atime); else if (attr->ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); } if (attr->ia_valid & (ATTR_MTIME_SET|ATTR_MTIME)) { NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode_set_mtime_to_ts(inode, fattr->mtime); else if (attr->ia_valid & ATTR_MTIME_SET) inode_set_mtime_to_ts(inode, attr->ia_mtime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); } if (fattr->valid) nfs_update_inode(inode, fattr); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); /* * Don't request help from readdirplus if the file is being written to, * or if attribute caching is turned off */ static bool nfs_getattr_readdirplus_enable(const struct inode *inode) { return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) && !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ; } static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) { if (!IS_ROOT(dentry)) { struct dentry *parent = dget_parent(dentry); nfs_readdir_record_entry_cache_miss(d_inode(parent)); dput(parent); } } static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) { if (!IS_ROOT(dentry)) { struct dentry *parent = dget_parent(dentry); nfs_readdir_record_entry_cache_hit(d_inode(parent)); dput(parent); } } static u32 nfs_get_valid_attrmask(struct inode *inode) { u64 fattr_valid = NFS_SERVER(inode)->fattr_valid; unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); u32 reply_mask = STATX_INO | STATX_TYPE; if (!(cache_validity & NFS_INO_INVALID_ATIME)) reply_mask |= STATX_ATIME; if (!(cache_validity & NFS_INO_INVALID_CTIME)) reply_mask |= STATX_CTIME; if (!(cache_validity & NFS_INO_INVALID_MTIME)) reply_mask |= STATX_MTIME; if (!(cache_validity & NFS_INO_INVALID_SIZE)) reply_mask |= STATX_SIZE; if (!(cache_validity & NFS_INO_INVALID_NLINK)) reply_mask |= STATX_NLINK; if (!(cache_validity & NFS_INO_INVALID_MODE)) reply_mask |= STATX_MODE; if (!(cache_validity & NFS_INO_INVALID_OTHER)) reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; if (!(cache_validity & NFS_INO_INVALID_BTIME) && (fattr_valid & NFS_ATTR_FATTR_BTIME)) reply_mask |= STATX_BTIME; if (!(cache_validity & NFS_INO_INVALID_CHANGE)) reply_mask |= STATX_CHANGE_COOKIE; return reply_mask; } int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct nfs_server *server = NFS_SERVER(inode); u64 fattr_valid = server->fattr_valid; unsigned long cache_validity; int err = 0; bool force_sync = query_flags & AT_STATX_FORCE_SYNC; bool do_update = false; bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode); trace_nfs_getattr_enter(inode); request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | STATX_CHANGE_COOKIE; if (!(fattr_valid & NFS_ATTR_FATTR_BTIME)) request_mask &= ~STATX_BTIME; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { if (readdirplus_enabled) nfs_readdirplus_parent_cache_hit(path->dentry); goto out_no_revalidate; } /* Flush out writes to the server in order to update c/mtime/version. */ if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && S_ISREG(inode->i_mode)) { if (nfs_have_delegated_mtime(inode)) filemap_fdatawrite(inode->i_mapping); else filemap_write_and_wait(inode->i_mapping); } /* * We may force a getattr if the user cares about atime. * * Note that we only have to check the vfsmount flags here: * - NFS always sets S_NOATIME by so checking it would give a * bogus result * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is * no point in checking those. */ if ((path->mnt->mnt_flags & MNT_NOATIME) || ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) request_mask &= ~STATX_ATIME; /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| STATX_SIZE|STATX_BLOCKS|STATX_BTIME| STATX_CHANGE_COOKIE))) goto out_no_revalidate; /* Check whether the cached attributes are stale */ do_update |= force_sync || nfs_attribute_cache_expired(inode); cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); do_update |= cache_validity & NFS_INO_INVALID_CHANGE; if (request_mask & STATX_ATIME) do_update |= cache_validity & NFS_INO_INVALID_ATIME; if (request_mask & STATX_CTIME) do_update |= cache_validity & NFS_INO_INVALID_CTIME; if (request_mask & STATX_MTIME) do_update |= cache_validity & NFS_INO_INVALID_MTIME; if (request_mask & STATX_SIZE) do_update |= cache_validity & NFS_INO_INVALID_SIZE; if (request_mask & STATX_NLINK) do_update |= cache_validity & NFS_INO_INVALID_NLINK; if (request_mask & STATX_MODE) do_update |= cache_validity & NFS_INO_INVALID_MODE; if (request_mask & (STATX_UID | STATX_GID)) do_update |= cache_validity & NFS_INO_INVALID_OTHER; if (request_mask & STATX_BLOCKS) do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; if (request_mask & STATX_BTIME) do_update |= cache_validity & NFS_INO_INVALID_BTIME; if (do_update) { if (readdirplus_enabled) nfs_readdirplus_parent_cache_miss(path->dentry); err = __nfs_revalidate_inode(server, inode); if (err) goto out; } else if (readdirplus_enabled) nfs_readdirplus_parent_cache_hit(path->dentry); out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); stat->change_cookie = inode_peek_iversion_raw(inode); stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; stat->btime = NFS_I(inode)->btime; out: trace_nfs_getattr_exit(inode, err); return err; } EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { refcount_set(&l_ctx->count, 1); l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *pos; list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) { if (pos->lockowner != current->files) continue; if (refcount_inc_not_zero(&pos->count)) return pos; } return NULL; } struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *res, *new = NULL; struct inode *inode = d_inode(ctx->dentry); rcu_read_lock(); res = __nfs_find_lock_context(ctx); rcu_read_unlock(); if (res == NULL) { new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); if (new == NULL) return ERR_PTR(-ENOMEM); nfs_init_lock_context(new); spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); if (res == NULL) { new->open_context = get_nfs_open_context(ctx); if (new->open_context) { list_add_tail_rcu(&new->list, &ctx->lock_context.list); res = new; new = NULL; } else res = ERR_PTR(-EBADF); } spin_unlock(&inode->i_lock); kfree(new); } return res; } EXPORT_SYMBOL_GPL(nfs_get_lock_context); void nfs_put_lock_context(struct nfs_lock_context *l_ctx) { struct nfs_open_context *ctx = l_ctx->open_context; struct inode *inode = d_inode(ctx->dentry); if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; list_del_rcu(&l_ctx->list); spin_unlock(&inode->i_lock); put_nfs_open_context(ctx); kfree_rcu(l_ctx, rcu_head); } EXPORT_SYMBOL_GPL(nfs_put_lock_context); /** * nfs_close_context - Common close_context() routine NFSv2/v3 * @ctx: pointer to context * @is_sync: is this a synchronous close * * Ensure that the attributes are up to date if we're mounted * with close-to-open semantics and we have cached data that will * need to be revalidated on open. */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) { struct nfs_inode *nfsi; struct inode *inode; if (!(ctx->mode & FMODE_WRITE)) return; if (!is_sync) return; inode = d_inode(ctx->dentry); if (nfs_have_read_or_write_delegation(inode)) return; nfsi = NFS_I(inode); if (inode->i_mapping->nrpages == 0) return; if (nfsi->cache_validity & NFS_INO_INVALID_DATA) return; if (!list_empty(&nfsi->open_files)) return; if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO) return; nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); } EXPORT_SYMBOL_GPL(nfs_close_context); struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode, struct file *filp) { struct nfs_open_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (!ctx) return ERR_PTR(-ENOMEM); nfs_sb_active(dentry->d_sb); ctx->dentry = dget(dentry); if (filp) ctx->cred = get_cred(filp->f_cred); else ctx->cred = get_current_cred(); rcu_assign_pointer(ctx->ll_cred, NULL); ctx->state = NULL; ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; ctx->flock_owner = (fl_owner_t)filp; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); ctx->mdsthreshold = NULL; nfs_localio_file_init(&ctx->nfl); return ctx; } EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count)) return ctx; return NULL; } EXPORT_SYMBOL_GPL(get_nfs_open_context); static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; if (!refcount_dec_and_test(&ctx->lock_context.count)) return; if (!list_empty(&ctx->list)) { spin_lock(&inode->i_lock); list_del_rcu(&ctx->list); spin_unlock(&inode->i_lock); } if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); put_cred(ctx->cred); dput(ctx->dentry); nfs_sb_deactive(sb); put_rpccred(rcu_dereference_protected(ctx->ll_cred, 1)); kfree(ctx->mdsthreshold); nfs_close_local_fh(&ctx->nfl); kfree_rcu(ctx, rcu_head); } void put_nfs_open_context(struct nfs_open_context *ctx) { __put_nfs_open_context(ctx, 0); } EXPORT_SYMBOL_GPL(put_nfs_open_context); static void put_nfs_open_context_sync(struct nfs_open_context *ctx) { __put_nfs_open_context(ctx, 1); } /* * Ensure that mmap has a recent RPC credential for use when writing out * shared pages */ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) { struct inode *inode = d_inode(ctx->dentry); struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); if (list_empty(&nfsi->open_files) && nfs_ooo_test(nfsi)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { filp->private_data = get_nfs_open_context(ctx); set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); if (list_empty(&ctx->list)) nfs_inode_attach_open_context(ctx); } EXPORT_SYMBOL_GPL(nfs_file_set_open_context); /* * Given an inode, search for an open context with the desired characteristics */ struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *pos, *ctx = NULL; rcu_read_lock(); list_for_each_entry_rcu(pos, &nfsi->open_files, list) { if (cred != NULL && cred_fscmp(pos->cred, cred) != 0) continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags)) continue; ctx = get_nfs_open_context(pos); if (ctx) break; } rcu_read_unlock(); return ctx; } void nfs_file_clear_open_context(struct file *filp) { struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { struct inode *inode = d_inode(ctx->dentry); clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); /* * We fatal error on write before. Try to writeback * every page again. */ if (ctx->error < 0) invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; put_nfs_open_context_sync(ctx); } } /* * These allocate and release file read/write context information. */ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; ctx = alloc_nfs_open_context(file_dentry(filp), flags_to_mode(filp->f_flags), filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); nfs_fscache_open_file(inode, filp); return 0; } /* * This function is called whenever some part of NFS notices that * the cached attributes have to be refreshed. */ int __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { int status = -ESTALE; struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); trace_nfs_revalidate_inode_enter(inode); if (is_bad_inode(inode)) goto out; if (NFS_STALE(inode)) goto out; /* pNFS: Attributes aren't updated until we layoutcommit */ if (S_ISREG(inode->i_mode)) { status = pnfs_sync_inode(inode, false); if (status) goto out; } status = -ENOMEM; fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) goto out; nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, inode); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), status); switch (status) { case -ETIMEDOUT: /* A soft timeout occurred. Use cached information? */ if (server->flags & NFS_MOUNT_SOFTREVAL) status = 0; break; case -ESTALE: if (!S_ISDIR(inode->i_mode)) nfs_set_inode_stale(inode); else nfs_zap_caches(inode); } goto out; } status = nfs_refresh_inode(inode, fattr); if (status) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), status); goto out; } if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); nfs_setsecurity(inode, fattr); dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); out: nfs_free_fattr(fattr); trace_nfs_revalidate_inode_exit(inode, status); return status; } int nfs_attribute_cache_expired(struct inode *inode) { if (nfs_have_delegated_attributes(inode)) return 0; return nfs_attribute_timeout(inode); } /** * nfs_revalidate_inode - Revalidate the inode attributes * @inode: pointer to inode struct * @flags: cache flags to check * * Updates inode attribute information by retrieving the data from the server. */ int nfs_revalidate_inode(struct inode *inode, unsigned long flags) { if (!nfs_check_cache_invalid(inode, flags)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(NFS_SERVER(inode), inode); } EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { int ret; nfs_fscache_invalidate(inode, 0); if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; } ret = invalidate_inode_pages2(mapping); if (ret < 0) return ret; } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); return 0; } /** * nfs_clear_invalid_mapping - Conditionally clear a mapping * @mapping: pointer to mapping * * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping. */ int nfs_clear_invalid_mapping(struct address_space *mapping) { struct inode *inode = mapping->host; struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; int ret = 0; /* * We must clear NFS_INO_INVALID_DATA first to ensure that * invalidations that come in while we're shooting down the mappings * are respected. But, that leaves a race window where one revalidator * can clear the flag, and then another checks it before the mapping * gets invalidated. Fix that by serializing access to this part of * the function. * * At the same time, we need to allow other tasks to see whether we * might be in the middle of invalidating the pages, so we only set * the bit lock here if it looks like we're going to be doing that. */ for (;;) { ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, nfs_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; smp_rmb(); /* pairs with smp_wmb() below */ if (test_bit(NFS_INO_INVALIDATING, bitlock)) continue; /* pairs with nfs_set_cache_invalid()'s smp_store_release() */ if (!(smp_load_acquire(&nfsi->cache_validity) & NFS_INO_INVALID_DATA)) goto out; /* Slow-path that double-checks with spinlock held */ spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock)) { spin_unlock(&inode->i_lock); continue; } if (nfsi->cache_validity & NFS_INO_INVALID_DATA) break; spin_unlock(&inode->i_lock); goto out; } set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; nfs_ooo_clear(nfsi); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; } bool nfs_mapping_need_revalidate_inode(struct inode *inode) { return nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE) || NFS_STALE(inode); } int nfs_revalidate_mapping_rcu(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; int ret = 0; if (IS_SWAPFILE(inode)) goto out; if (nfs_mapping_need_revalidate_inode(inode)) { ret = -ECHILD; goto out; } spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock) || (nfsi->cache_validity & NFS_INO_INVALID_DATA)) ret = -ECHILD; spin_unlock(&inode->i_lock); out: return ret; } /** * nfs_revalidate_mapping - Revalidate the pagecache * @inode: pointer to host inode * @mapping: pointer to mapping */ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { /* swapfiles are not supposed to be shared. */ if (IS_SWAPFILE(inode)) return 0; if (nfs_mapping_need_revalidate_inode(inode)) { int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) return ret; } return nfs_clear_invalid_mapping(mapping); } static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; if (!S_ISREG(inode->i_mode)) return false; if (list_empty(&nfsi->open_files)) return false; return inode_is_open_for_write(inode); } static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) { return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct timespec64 ts; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) { inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); else if (nfs_server_capable(inode, NFS_CAP_XATTR)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); } /* If we have atomic WCC data, we may update some attributes */ ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec64_equal(&ts, &fattr->pre_ctime)) { inode_set_ctime_to_ts(inode, fattr->ctime); } ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) && timespec64_equal(&ts, &fattr->pre_mtime)) { inode_set_mtime_to_ts(inode, fattr->mtime); } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && !nfs_have_writebacks(inode)) { trace_nfs_size_wcc(inode, fattr->size); i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } } /** * nfs_check_inode_attributes - verify consistency of the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * Verifies the attribute cache. If we have just changed the attributes, * so that fattr carries weak cache consistency data, then it may * also update the ctime/mtime/change_attribute. */ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; unsigned long invalid = 0; struct timespec64 ts; if (nfs_have_delegated_attributes(inode)) return 0; if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) return 0; /* Has the inode gone and changed behind our back? */ } else if (nfsi->fileid != fattr->fileid) { /* Is this perhaps the mounted-on fileid? */ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && nfsi->fileid == fattr->mounted_on_fileid) return 0; return -ESTALE; } if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) return -ESTALE; if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) invalid |= NFS_INO_INVALID_CHANGE; ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); if (cur_size != new_isize) invalid |= NFS_INO_INVALID_SIZE; } } /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) invalid |= NFS_INO_INVALID_MODE; if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) invalid |= NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) invalid |= NFS_INO_INVALID_OTHER; /* Has the link count changed? */ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_NLINK; ts = inode_get_atime(inode); if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; } static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) { return atomic_long_read(&nfs_attr_generation_counter); } unsigned long nfs_inc_attr_generation_counter(void) { return atomic_long_inc_return(&nfs_attr_generation_counter); } EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter); void nfs_fattr_init(struct nfs_fattr *fattr) { fattr->valid = 0; fattr->time_start = jiffies; fattr->gencount = nfs_inc_attr_generation_counter(); fattr->owner_name = NULL; fattr->group_name = NULL; fattr->mdsthreshold = NULL; } EXPORT_SYMBOL_GPL(nfs_fattr_init); /** * nfs_fattr_set_barrier * @fattr: attributes * * Used to set a barrier after an attribute was updated. This * barrier ensures that older attributes from RPC calls that may * have raced with our update cannot clobber these new values. * Note that you are still responsible for ensuring that other * operations which change the attribute on the server do not * collide. */ void nfs_fattr_set_barrier(struct nfs_fattr *fattr) { fattr->gencount = nfs_inc_attr_generation_counter(); } struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; fattr = kmalloc(sizeof(*fattr), GFP_KERNEL); if (fattr != NULL) { nfs_fattr_init(fattr); fattr->label = NULL; } return fattr; } EXPORT_SYMBOL_GPL(nfs_alloc_fattr); struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server) { struct nfs_fattr *fattr = nfs_alloc_fattr(); if (!fattr) return NULL; fattr->label = nfs4_label_alloc(server, GFP_KERNEL); if (IS_ERR(fattr->label)) { kfree(fattr); return NULL; } return fattr; } EXPORT_SYMBOL_GPL(nfs_alloc_fattr_with_label); struct nfs_fh *nfs_alloc_fhandle(void) { struct nfs_fh *fh; fh = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); if (fh != NULL) fh->size = 0; return fh; } EXPORT_SYMBOL_GPL(nfs_alloc_fhandle); #ifdef NFS_DEBUG /* * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle * in the same way that wireshark does * * @fh: file handle * * For debugging only. */ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) { /* wireshark uses 32-bit AUTODIN crc and does a bitwise * not on the result */ return nfs_fhandle_hash(fh); } EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash); /* * _nfs_display_fhandle - display an NFS file handle on the console * * @fh: file handle to display * @caption: display caption * * For debugging only. */ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) { unsigned short i; if (fh == NULL || fh->size == 0) { printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh); return; } printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n", caption, fh, fh->size, _nfs_display_fhandle_hash(fh)); for (i = 0; i < fh->size; i += 16) { __be32 *pos = (__be32 *)&fh->data[i]; switch ((fh->size - i - 1) >> 2) { case 0: printk(KERN_DEFAULT " %08x\n", be32_to_cpup(pos)); break; case 1: printk(KERN_DEFAULT " %08x %08x\n", be32_to_cpup(pos), be32_to_cpup(pos + 1)); break; case 2: printk(KERN_DEFAULT " %08x %08x %08x\n", be32_to_cpup(pos), be32_to_cpup(pos + 1), be32_to_cpup(pos + 2)); break; default: printk(KERN_DEFAULT " %08x %08x %08x %08x\n", be32_to_cpup(pos), be32_to_cpup(pos + 1), be32_to_cpup(pos + 2), be32_to_cpup(pos + 3)); } } } EXPORT_SYMBOL_GPL(_nfs_display_fhandle); #endif /** * nfs_inode_attrs_cmp_generic - compare attributes * @fattr: attributes * @inode: pointer to inode * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. * Note also the check for wraparound of 'attr_gencount' * * The function returns '1' if it thinks the attributes in @fattr are * more recent than the ones cached in @inode. Otherwise it returns * the value '0'. */ static int nfs_inode_attrs_cmp_generic(const struct nfs_fattr *fattr, const struct inode *inode) { unsigned long attr_gencount = NFS_I(inode)->attr_gencount; return (long)(fattr->gencount - attr_gencount) > 0 || (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0; } /** * nfs_inode_attrs_cmp_monotonic - compare attributes * @fattr: attributes * @inode: pointer to inode * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. * * We assume that the server observes monotonic semantics for * the change attribute, so a larger value means that the attributes in * @fattr are more recent, in which case the function returns the * value '1'. * A return value of '0' indicates no measurable change * A return value of '-1' means that the attributes in @inode are * more recent. */ static int nfs_inode_attrs_cmp_monotonic(const struct nfs_fattr *fattr, const struct inode *inode) { s64 diff = fattr->change_attr - inode_peek_iversion_raw(inode); if (diff > 0) return 1; return diff == 0 ? 0 : -1; } /** * nfs_inode_attrs_cmp_strict_monotonic - compare attributes * @fattr: attributes * @inode: pointer to inode * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. * * We assume that the server observes strictly monotonic semantics for * the change attribute, so a larger value means that the attributes in * @fattr are more recent, in which case the function returns the * value '1'. * A return value of '-1' means that the attributes in @inode are * more recent or unchanged. */ static int nfs_inode_attrs_cmp_strict_monotonic(const struct nfs_fattr *fattr, const struct inode *inode) { return nfs_inode_attrs_cmp_monotonic(fattr, inode) > 0 ? 1 : -1; } /** * nfs_inode_attrs_cmp - compare attributes * @fattr: attributes * @inode: pointer to inode * * This function returns '1' if it thinks the attributes in @fattr are * more recent than the ones cached in @inode. It returns '-1' if * the attributes in @inode are more recent than the ones in @fattr, * and it returns 0 if not sure. */ static int nfs_inode_attrs_cmp(const struct nfs_fattr *fattr, const struct inode *inode) { if (nfs_inode_attrs_cmp_generic(fattr, inode) > 0) return 1; switch (NFS_SERVER(inode)->change_attr_type) { case NFS4_CHANGE_TYPE_IS_UNDEFINED: break; case NFS4_CHANGE_TYPE_IS_TIME_METADATA: if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE)) break; return nfs_inode_attrs_cmp_monotonic(fattr, inode); default: if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE)) break; return nfs_inode_attrs_cmp_strict_monotonic(fattr, inode); } return 0; } /** * nfs_inode_finish_partial_attr_update - complete a previous inode update * @fattr: attributes * @inode: pointer to inode * * Returns '1' if the last attribute update left the inode cached * attributes in a partially unrevalidated state, and @fattr * matches the change attribute of that partial update. * Otherwise returns '0'. */ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr, const struct inode *inode) { const unsigned long check_valid = NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_NLINK | NFS_INO_INVALID_BTIME; unsigned long cache_validity = NFS_I(inode)->cache_validity; enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type; if (ctype != NFS4_CHANGE_TYPE_IS_UNDEFINED && !(cache_validity & NFS_INO_INVALID_CHANGE) && (cache_validity & check_valid) != 0 && (fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && nfs_inode_attrs_cmp_monotonic(fattr, inode) == 0) return 1; return 0; } static void nfs_ooo_merge(struct nfs_inode *nfsi, u64 start, u64 end) { int i, cnt; if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) /* No point merging anything */ return; if (!nfsi->ooo) { nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC); if (!nfsi->ooo) { nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; return; } nfsi->ooo->cnt = 0; } /* add this range, merging if possible */ cnt = nfsi->ooo->cnt; for (i = 0; i < cnt; i++) { if (end == nfsi->ooo->gap[i].start) end = nfsi->ooo->gap[i].end; else if (start == nfsi->ooo->gap[i].end) start = nfsi->ooo->gap[i].start; else continue; /* Remove 'i' from table and loop to insert the new range */ cnt -= 1; nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt]; i = -1; } if (start != end) { if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) { nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; kfree(nfsi->ooo); nfsi->ooo = NULL; return; } nfsi->ooo->gap[cnt].start = start; nfsi->ooo->gap[cnt].end = end; cnt += 1; } nfsi->ooo->cnt = cnt; } static void nfs_ooo_record(struct nfs_inode *nfsi, struct nfs_fattr *fattr) { /* This reply was out-of-order, so record in the * pre/post change id, possibly cancelling * gaps created when iversion was jumpped forward. */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) nfs_ooo_merge(nfsi, fattr->change_attr, fattr->pre_change_attr); } static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int attr_cmp = nfs_inode_attrs_cmp(fattr, inode); int ret = 0; trace_nfs_refresh_inode_enter(inode); if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode)) ret = nfs_update_inode(inode, fattr); else { nfs_ooo_record(NFS_I(inode), fattr); if (attr_cmp == 0) ret = nfs_check_inode_attributes(inode, fattr); } trace_nfs_refresh_inode_exit(inode, ret); return ret; } /** * nfs_refresh_inode - try to update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * Check that an RPC call that returned attributes has not overlapped with * other recent updates of the inode metadata, then decide whether it is * safe to do a full update of the inode attributes, or whether just to * call nfs_check_inode_attributes. */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { int status; if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr, unsigned int invalid) { if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; return nfs_refresh_inode_locked(inode, fattr); } /** * nfs_post_op_update_inode - try to update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. * * NB: if the server didn't return any post op attributes, this * function will force the retrieval of attributes before the next * NFS request. Thus it should be used only for operations that * are expected to change one or more attributes, to avoid * unnecessary NFS requests and trips through nfs_update_inode(). */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { int status; spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_locked(inode, fattr, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); return status; } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up * weak cache consistency data, if none exist. * * This function is mainly designed to be used by the ->write_done() functions. */ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { int attr_cmp = nfs_inode_attrs_cmp(fattr, inode); int status; /* Don't do a WCC update if these attributes are already stale */ if (attr_cmp < 0) return 0; if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) { /* Record the pre/post change info before clearing PRECHANGE */ nfs_ooo_record(NFS_I(inode), fattr); fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE | NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PREMTIME | NFS_ATTR_FATTR_PRECTIME); goto out_noforce; } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { fattr->pre_change_attr = inode_peek_iversion_raw(inode); fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { fattr->pre_ctime = inode_get_ctime(inode); fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { fattr->pre_mtime = inode_get_mtime(inode); fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_BLOCKS); return status; } /** * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up * weak cache consistency data, if none exist. * * This function is mainly designed to be used by the ->write_done() functions. */ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) { int status; spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state * of the server's inode. * * This is a bit tricky because we have to make sure all dirty pages * have been sent off to the server before calling invalidate_inode_pages. * To make sure no other process adds more write requests while we try * our best to flush them, we make them sleep during the attribute refresh. * * A very similar scenario holds for the dir cache. */ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_isize, new_isize; u64 fattr_supported = server->fattr_valid; unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; bool attr_changed = false; bool have_delegation; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) return 0; /* Has the inode gone and changed behind our back? */ } else if (nfsi->fileid != fattr->fileid) { /* Is this perhaps the mounted-on fileid? */ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && nfsi->fileid == fattr->mounted_on_fileid) return 0; printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, (long long)nfsi->fileid, (long long)fattr->fileid); goto out_err; } /* * Make sure the inode's type hasn't changed. */ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) { /* * Big trouble! The inode has become a different object. */ printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); goto out_err; } /* Update the fsid? */ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; /* Save the delegation state before clearing cache_validity */ have_delegation = nfs_have_delegated_attributes(inode); /* * Update the read time so we don't revalidate too often. */ nfsi->read_cache_jiffies = fattr->time_start; /* Fix up any delegated attributes in the struct nfs_fattr */ nfs_fattr_fixup_delegated(inode, fattr); save_cache_validity = nfsi->cache_validity; nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED | NFS_INO_INVALID_BLOCKS); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); if (pnfs_layoutcommit_outstanding(inode)) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS); cache_revalidated = false; } /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 && nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) { /* There is one remaining gap that hasn't been * merged into iversion - do that now. */ inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start); kfree(nfsi->ooo); nfsi->ooo = NULL; } if (!inode_eq_iversion_raw(inode, fattr->change_attr)) { /* Could it be a race with writeback? */ if (!(have_writers || have_delegation)) { invalid |= NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR; /* Force revalidate of all attributes */ save_cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_BTIME; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); attr_changed = true; dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } else if (!have_delegation) { nfs_ooo_record(nfsi, fattr); nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode), fattr->change_attr); } inode_set_iversion_raw(inode, fattr->change_attr); } } else { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CHANGE; if (!have_delegation || (nfsi->cache_validity & NFS_INO_INVALID_CHANGE) != 0) cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MTIME; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_BTIME) nfsi->btime = fattr->btime; else if (fattr_supported & NFS_ATTR_FATTR_BTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BTIME; /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { new_isize = nfs_size_to_loff_t(fattr->size); cur_isize = i_size_read(inode); if (new_isize != cur_isize && !have_delegation) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { trace_nfs_size_update(inode, new_isize); i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_DATA; } } if (new_isize == 0 && !(fattr->valid & (NFS_ATTR_FATTR_SPACE_USED | NFS_ATTR_FATTR_BLOCKS_USED))) { fattr->du.nfs3.used = 0; fattr->valid |= NFS_ATTR_FATTR_SPACE_USED; } } else nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_SIZE; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATIME; if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { umode_t newmode = inode->i_mode & S_IFMT; newmode |= fattr->mode & S_IALLUGO; inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; } } else if (fattr_supported & NFS_ATTR_FATTR_MODE) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MODE; if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) set_nlink(inode, fattr->nlink); } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_NLINK; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; /* Update attrtimeo value if we're out of the unstable period */ if (attr_changed) { nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (cache_revalidated) { if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { nfsi->attrtimeo <<= 1; if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); } nfsi->attrtimeo_timestamp = now; } /* Set the barrier to be more recent than this fattr */ if ((long)(fattr->gencount - nfsi->attr_gencount) > 0) nfsi->attr_gencount = fattr->gencount; } /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); return 0; out_err: /* * No need to worry about unhashing the dentry, as the * lookup validation will know that the inode is bad. * (But we fall through to invalidate the caches.) */ nfs_set_inode_stale_locked(inode); return -ESTALE; } struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; nfsi->ooo = NULL; #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_2 nfsi->xattr_cache = NULL; #endif nfs_netfs_inode_init(nfsi); return &nfsi->vfs_inode; } EXPORT_SYMBOL_GPL(nfs_alloc_inode); void nfs_free_inode(struct inode *inode) { kfree(NFS_I(inode)->ooo); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } EXPORT_SYMBOL_GPL(nfs_free_inode); static inline void nfs4_init_once(struct nfs_inode *nfsi) { #if IS_ENABLED(CONFIG_NFS_V4) INIT_LIST_HEAD(&nfsi->open_states); nfsi->delegation = NULL; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; #endif } static void init_once(void *foo) { struct nfs_inode *nfsi = foo; inode_init_once(&nfsi->vfs_inode); INIT_LIST_HEAD(&nfsi->open_files); INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); nfs4_init_once(nfsi); } static int __init nfs_init_inodecache(void) { nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void nfs_destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(nfs_inode_cachep); } struct workqueue_struct *nfslocaliod_workqueue; struct workqueue_struct *nfsiod_workqueue; EXPORT_SYMBOL_GPL(nfsiod_workqueue); /* * Destroy the nfsiod workqueues */ static void nfsiod_stop(void) { struct workqueue_struct *wq; wq = nfsiod_workqueue; if (wq != NULL) { nfsiod_workqueue = NULL; destroy_workqueue(wq); } #if IS_ENABLED(CONFIG_NFS_LOCALIO) wq = nfslocaliod_workqueue; if (wq != NULL) { nfslocaliod_workqueue = NULL; destroy_workqueue(wq); } #endif /* CONFIG_NFS_LOCALIO */ } /* * Start the nfsiod workqueues */ static int nfsiod_start(void) { dprintk("RPC: creating workqueue nfsiod\n"); nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (nfsiod_workqueue == NULL) return -ENOMEM; #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* * localio writes need to use a normal (non-memreclaim) workqueue. * When we start getting low on space, XFS goes and calls flush_work() on * a non-memreclaim work queue, which causes a priority inversion problem. */ dprintk("RPC: creating workqueue nfslocaliod\n"); nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0); if (unlikely(nfslocaliod_workqueue == NULL)) { nfsiod_stop(); return -ENOMEM; } #endif /* CONFIG_NFS_LOCALIO */ return 0; } unsigned int nfs_net_id; EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); int err; nfs_clients_init(net); if (!rpc_proc_register(net, &nn->rpcstats)) { err = -ENOMEM; goto err_proc_rpc; } err = nfs_fs_proc_net_init(net); if (err) goto err_proc_nfs; return 0; err_proc_nfs: rpc_proc_unregister(net, "nfs"); err_proc_rpc: nfs_clients_exit(net); return err; } static void nfs_net_exit(struct net *net) { rpc_proc_unregister(net, "nfs"); nfs_fs_proc_net_exit(net); nfs_clients_exit(net); } static struct pernet_operations nfs_net_ops = { .init = nfs_net_init, .exit = nfs_net_exit, .id = &nfs_net_id, .size = sizeof(struct nfs_net), }; #ifdef CONFIG_KEYS static struct key *nfs_keyring; static int __init nfs_init_keyring(void) { nfs_keyring = keyring_alloc(".nfs", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), (KEY_POS_ALL & ~KEY_POS_SETATTR) | (KEY_USR_ALL & ~KEY_USR_SETATTR), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); return PTR_ERR_OR_ZERO(nfs_keyring); } static void nfs_exit_keyring(void) { key_put(nfs_keyring); } #else static inline int nfs_init_keyring(void) { return 0; } static inline void nfs_exit_keyring(void) { } #endif /* CONFIG_KEYS */ /* * Initialize NFS */ static int __init init_nfs_fs(void) { int err; err = nfs_init_keyring(); if (err) return err; err = nfs_sysfs_init(); if (err < 0) goto out10; err = register_pernet_subsys(&nfs_net_ops); if (err < 0) goto out9; err = nfsiod_start(); if (err) goto out7; err = nfs_fs_proc_init(); if (err) goto out6; err = nfs_init_nfspagecache(); if (err) goto out5; err = nfs_init_inodecache(); if (err) goto out4; err = nfs_init_readpagecache(); if (err) goto out3; err = nfs_init_writepagecache(); if (err) goto out2; err = nfs_init_directcache(); if (err) goto out1; err = register_nfs_fs(); if (err) goto out0; return 0; out0: nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); out2: nfs_destroy_readpagecache(); out3: nfs_destroy_inodecache(); out4: nfs_destroy_nfspagecache(); out5: nfs_fs_proc_exit(); out6: nfsiod_stop(); out7: unregister_pernet_subsys(&nfs_net_ops); out9: nfs_sysfs_exit(); out10: nfs_exit_keyring(); return err; } static void __exit exit_nfs_fs(void) { nfs_destroy_directcache(); nfs_destroy_writepagecache(); nfs_destroy_readpagecache(); nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); unregister_pernet_subsys(&nfs_net_ops); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); nfs_sysfs_exit(); nfs_exit_keyring(); } /* Not quite true; I just maintain it */ MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("NFS client support"); MODULE_LICENSE("GPL"); module_param(enable_ino64, bool, 0644); module_init(init_nfs_fs) module_exit(exit_nfs_fs)
11 11 11 11 11 11 11 11 1 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 // SPDX-License-Identifier: GPL-2.0-or-later /* Unbuffered and direct write support. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/uio.h> #include "internal.h" /* * Perform an unbuffered write where we may have to do an RMW operation on an * encrypted file. This can also be used for direct I/O writes. */ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group) { struct netfs_io_request *wreq; unsigned long long start = iocb->ki_pos; unsigned long long end = start + iov_iter_count(iter); ssize_t ret, n; size_t len = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); _enter(""); /* We're going to need a bounce buffer if what we transmit is going to * be different in some way to the source buffer, e.g. because it gets * encrypted/compressed or because it needs expanding to a block size. */ // TODO _debug("uw %llx-%llx", start, end); wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start, iocb->ki_flags & IOCB_DIRECT ? NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE); if (IS_ERR(wreq)) return PTR_ERR(wreq); wreq->io_streams[0].avail = true; trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ? netfs_write_trace_dio_write : netfs_write_trace_unbuffered_write)); { /* If this is an async op and we're not using a bounce buffer, * we have to save the source buffer as the iterator is only * good until we return. In such a case, extract an iterator * to represent as much of the the output buffer as we can * manage. Note that the extraction might not be able to * allocate a sufficiently large bvec array and may shorten the * request. */ if (user_backed_iter(iter)) { n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0); if (n < 0) { ret = n; goto out; } wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec; wreq->direct_bv_count = n; wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); } else { /* If this is a kernel-generated async DIO request, * assume that any resources the iterator points to * (eg. a bio_vec array) will persist till the end of * the op. */ wreq->buffer.iter = *iter; } } __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); if (async) __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); /* Copy the data into the bounce buffer and encrypt it. */ // TODO /* Dispatch the write. */ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); if (async) wreq->iocb = iocb; wreq->len = iov_iter_count(&wreq->buffer.iter); ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { _debug("begin = %zd", ret); goto out; } if (!async) { ret = netfs_wait_for_write(wreq); if (ret > 0) iocb->ki_pos += ret; } else { ret = -EIOCBQUEUED; } out: netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; } EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); /** * netfs_unbuffered_write_iter - Unbuffered write to a file * @iocb: IO state structure * @from: iov_iter with data to write * * Do an unbuffered write to a file, writing the data directly to the server * and not lodging the data in the pagecache. * * Return: * * Negative error code if no data has been written at all of * vfs_fsync_range() failed for a synchronous write * * Number of bytes written, even for truncated writes */ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct netfs_inode *ictx = netfs_inode(inode); ssize_t ret; loff_t pos = iocb->ki_pos; unsigned long long end = pos + iov_iter_count(from) - 1; _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode)); if (!iov_iter_count(from)) return 0; trace_netfs_write_iter(iocb, from); netfs_stat(&netfs_n_wh_dio_write); ret = netfs_start_io_direct(inode); if (ret < 0) return ret; ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; ret = file_remove_privs(file); if (ret < 0) goto out; ret = file_update_time(file); if (ret < 0) goto out; if (iocb->ki_flags & IOCB_NOWAIT) { /* We could block if there are any pages in the range. */ ret = -EAGAIN; if (filemap_range_has_page(mapping, pos, end)) if (filemap_invalidate_inode(inode, true, pos, end)) goto out; } else { ret = filemap_write_and_wait_range(mapping, pos, end); if (ret < 0) goto out; } /* * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return * without clobbering -EIOCBQUEUED from ->direct_IO(). */ ret = filemap_invalidate_inode(inode, true, pos, end); if (ret < 0) goto out; end = iocb->ki_pos + iov_iter_count(from); if (end > ictx->zero_point) ictx->zero_point = end; fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode), FSCACHE_INVAL_DIO_WRITE); ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL); out: netfs_end_io_direct(inode); return ret; } EXPORT_SYMBOL(netfs_unbuffered_write_iter);
125 97 75 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 // SPDX-License-Identifier: GPL-2.0-or-later /* Null security operations. * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <net/af_rxrpc.h> #include "ar-internal.h" static int none_init_connection_security(struct rxrpc_connection *conn, struct rxrpc_key_token *token) { return 0; } /* * Allocate an appropriately sized buffer for the amount of data remaining. */ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp); } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { txb->pkt_len = txb->len; if (txb->len == RXRPC_JUMBO_DATALEN) txb->jumboable = true; return 0; } static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); sp->flags |= RXRPC_RX_VERIFIED; return 0; } static void none_free_call_crypto(struct rxrpc_call *call) { } static bool none_validate_challenge(struct rxrpc_connection *conn, struct sk_buff *skb) { rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_challenge); return true; } static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge, struct msghdr *msg) { return -EINVAL; } static int none_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb) { return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_response); } static void none_clear(struct rxrpc_connection *conn) { } static int none_init(void) { return 0; } static void none_exit(void) { } /* * RxRPC Kerberos-based security */ const struct rxrpc_security rxrpc_no_security = { .name = "none", .security_index = RXRPC_SECURITY_NONE, .init = none_init, .exit = none_exit, .init_connection_security = none_init_connection_security, .free_call_crypto = none_free_call_crypto, .alloc_txbuf = none_alloc_txbuf, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, .validate_challenge = none_validate_challenge, .sendmsg_respond_to_challenge = none_sendmsg_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, };
142 981 854 39 585 581 862 847 727 620 940 940 864 916 31 50 1043 992 19 50 38 42 34 18 48 15 10 15 13 21 21 21 13 21 24 24 24 24 14 24 22 15 4 6 15 8 22 13 5 18 18 11 19 1020 651 865 864 683 477 1021 1025 24 21 24 22 1041 1038 1038 761 923 493 32 948 47 862 863 860 150 4 862 791 72 1068 430 638 931 1068 931 931 573 931 930 573 67 864 862 864 864 864 1069 1069 430 639 849 217 1065 849 217 1066 1066 204 734 129 1068 1105 910 224 1015 1 1015 1013 1015 1015 727 847 645 478 24 476 583 97 581 854 197 854 445 168 39 351 433 413 264 445 353 593 13 583 583 582 59 325 370 48 555 434 382 484 291 344 445 837 133 837 837 527 521 837 849 428 671 849 164 843 813 849 356 813 812 356 354 3 355 3 849 844 741 325 638 495 486 601 460 307 327 486 633 259 300 446 444 445 445 445 1052 1014 79 28 180 20 1 92 161 180 107 46 46 107 836 836 460 668 836 836 5 5 5 3 4 107 780 56 5 4 835 891 47 676 110 57 110 467 302 1025 1025 142 200 1015 165 976 472 635 292 247 218 890 1023 142 1026 1026 943 53 48 278 713 1025 585 441 1025 925 1025 927 97 1025 1026 1026 1026 677 58 866 445 660 844 844 1052 120 677 152 610 889 365 657 1026 872 857 445 1025 1024 122 1023 1521 1522 1522 1520 1502 20 1521 710 32 780 1488 1490 32 32 32 1522 1507 78 224 224 32 1520 1522 1521 1521 1519 1446 75 1521 1522 32 32 192 7 192 145 47 192 192 192 192 192 192 192 104 967 54 966 1 967 968 968 968 849 478 54 966 551 346 231 967 967 849 54 847 846 390 60 60 357 60 56 6 56 56 6 6 56 55 60 1051 687 390 273 99 41 158 142 105 65 17 67 8 23 239 238 358 358 357 355 1 11 248 237 233 1056 1051 234 166 112 3 3 4 47 270 270 102 102 178 1056 654 664 47 178 56 178 178 391 401 401 102 395 102 862 864 138 268 900 502 116 116 116 114 10 264 265 29 162 36 179 164 264 265 264 326 326 326 326 265 250 249 232 74 232 232 232 26 138 138 138 138 318 142 200 200 200 12 200 200 200 200 138 72 200 138 72 200 1542 577 1306 1305 196 196 1308 196 196 196 196 196 1052 236 851 851 654 551 144 401 1057 1055 1056 1057 333 333 333 851 270 10 332 333 502 458 843 458 842 845 401 1057 318 318 318 318 318 318 200 142 142 1090 1090 938 186 48 228 186 4 861 1015 234 1052 112 622 638 54 318 847 1057 895 283 845 48 245 1055 554 42 540 540 62 478 478 477 379 99 99 478 477 477 478 478 601 537 118 600 44 600 535 118 524 73 121 21 533 64 41 43 425 212 211 555 6 6 6 9 14 9 1 9 3 2 2 6 6 6 6 6 5 4 11 11 11 11 19 19 2 2 7 8 1 12 14 6 8 12 1 11 11 28 28 3 21 23 5 5 21 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ /* * mballoc.c contains the multiblocks allocation routines */ #include "ext4_jbd2.h" #include "mballoc.h" #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> #include <linux/freezer.h> #include <trace/events/ext4.h> #include <kunit/static_stub.h> /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() * - search for metadata in few groups * * TODO v4: * - normalization should take into account whether file is still open * - discard preallocations if no free space left (policy?) * - don't normalize tails * - quota * - reservation for superuser * * TODO v3: * - bitmap read-ahead (proposed by Oleg Drokin aka green) * - track min/max extents in each group for better group selection * - mb_mark_used() may allocate chunk right after splitting buddy * - tree of groups sorted by number of free blocks * - error handling */ /* * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * * During initialization phase of the allocator we decide to use the * group preallocation or inode preallocation depending on the size of * the file. The size of the file could be the resulting file size we * would have after allocation, or the current file size, which ever * is larger. If the size is less than sbi->s_mb_stream_request we * select to use the group preallocation. The default value of * s_mb_stream_request is 16 blocks. This can also be tuned via * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in * terms of number of blocks. * * The main motivation for having small file use group preallocation is to * ensure that we have small files closer together on the disk. * * First stage the allocator looks at the inode prealloc list, * ext4_inode_info->i_prealloc_list, which contains list of prealloc * spaces for this particular inode. The inode prealloc space is * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space * pa_len -> length for this prealloc space (in clusters) * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc * space we will consume the particular prealloc space. This makes sure that * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except * pa_free. * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * * The reason for having a per cpu locality group is to reduce the contention * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets * mapped to the buddy and bitmap information regarding different * groups. The buddy information is attached to buddy cache inode so that * we can access them through the page cache. The information regarding * each group is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are stored in the * inode as: * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / * blocksize) blocks. So it can have information regarding groups_per_page * which is blocks_per_page/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. * * We look for count number of blocks in the buddy cache. If we were able * to locate that many free blocks we return with additional information * regarding rest of the contiguous physical block available * * Before allocating blocks via buddy cache we normalize the request * blocks. This ensure we ask for more blocks that we needed. The extra * blocks that we get after allocation is added to the respective prealloc * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total * number of buddy bitmap orders possible) number of xarrays. Group-infos are * placed in appropriate xarrays. * * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where in the i-th xarray there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. * Note that we don't bother with a special xarray for completely empty * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed * in appropriate xarrays. * * In xarray, the index is the block group number, the value is the block group * information, and a non-empty value indicates the block group is present in * the current xarray. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order * >= the order of the request. We directly look at the largest free order list * in the data structure (1) above where largest_free_order = order of the * request. If that list is empty, we look at remaining list in the increasing * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED * lookup in O(1) time. * * At CR_GOAL_LEN_FAST, we only consider groups where * average fragment size > request size. So, we lookup a group which has average * fragment size just above or equal to request size using our average fragment * size group lists (data structure 2) in O(1) time. * * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in * CR_GOAL_LEN_FAST suggests that there is no BG that has avg * fragment size > goal length. So before falling to the slower * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big * enough average fragment size. This increases the chances of finding a * suitable block group in O(1) time and results in faster allocation at the * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous block in * stripe size. This should result in better allocation on RAID setups. If * not, we search in the specific group using bitmap for best extents. The * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. For * non rotational devices, this value defaults to 0 and for rotational devices * this is set to MB_DEFAULT_LINEAR_LIMIT. * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the * subsequent request. */ /* * mballoc operates on the following data: * - on-disk bitmap * - in-core buddy (actually includes buddy and bitmap) * - preallocation descriptors (PAs) * * there are two types of preallocations: * - inode * assiged to specific inode and can be used for this inode only. * it describes part of inode's space preallocated to specific * physical blocks. any block from that preallocated can be used * independent. the descriptor just tracks number of blocks left * unused. so, before taking some block from descriptor, one must * make sure corresponded logical block isn't allocated yet. this * also means that freeing any block within descriptor's range * must discard all preallocated blocks. * - locality group * assigned to specific locality group which does not translate to * permanent set of inodes: inode can join and leave group. space * from this type of preallocation can be used for any inode. thus * it's consumed from the beginning to the end. * * relation between them can be expressed as: * in-core buddy = on-disk bitmap + preallocation descriptors * * this mean blocks mballoc considers used are: * - allocated blocks (persistent) * - preallocated blocks (non-persistent) * * consistency in mballoc world means that at any time a block is either * free or used in ALL structures. notice: "any time" should not be read * literally -- time is discrete and delimited by locks. * * to keep it simple, we don't use block numbers, instead we count number of * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. * * all operations can be expressed as: * - init buddy: buddy = on-disk + PAs * - new PA: buddy += N; PA = N * - use inode PA: on-disk += N; PA -= N * - discard inode PA buddy -= on-disk - PA; PA = 0 * - use locality group PA on-disk += N; PA -= N * - discard locality group PA buddy -= PA; PA = 0 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap * is used in real operation because we can't know actual used * bits from PA, only from on-disk bitmap * * if we follow this strict logic, then all operations above should be atomic. * given some of them can block, we'd have to use something like semaphores * killing performance on high-end SMP hardware. let's try to relax it using * the following knowledge: * 1) if buddy is referenced, it's already initialized * 2) while block is used in buddy and the buddy is referenced, * nobody can re-allocate that block * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has * bit set and PA claims same block, it's OK. IOW, one can set bit in * on-disk bitmap if buddy has same bit set or/and PA covers corresponded * block * * so, now we're building a concurrency table: * - init buddy vs. * - new PA * blocks for PA are allocated in the buddy, buddy must be referenced * until PA is linked to allocation group to avoid concurrent buddy init * - use inode PA * we need to make sure that either on-disk bitmap or PA has uptodate data * given (3) we care that PA-=N operation doesn't interfere with init * - discard inode PA * the simplest way would be to have buddy initialized by the discard * - use locality group PA * again PA-=N must be serialized with init * - discard locality group PA * the simplest way would be to have buddy initialized by the discard * - new PA vs. * - use inode PA * i_data_sem serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * some mutex should serialize them * - discard locality group PA * discard process must wait until PA isn't used by another process * - use inode PA * - use inode PA * i_data_sem or another mutex should serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * nothing wrong here -- they're different PAs covering different blocks * - discard locality group PA * discard process must wait until PA isn't used by another process * * now we're ready to make few consequences: * - PA is referenced and while it is no discard is possible * - PA is referenced until block isn't marked in on-disk bitmap * - PA changes only after on-disk bitmap * - discard must not compete with init. either init is done before * any discard or they're serialized somehow * - buddy init as sum of on-disk bitmap and PAs is done atomically * * a special case when we've used PA to emptiness. no need to modify buddy * in this case, but we should care about concurrent init * */ /* * Logic in few words: * * - allocation: * load group * find blocks * mark bits in on-disk bitmap * release group * * - use preallocation: * find proper PA (per-inode or group) * load group * mark bits in on-disk bitmap * release group * release PA * * - free: * load group * mark bits in on-disk bitmap * release group * * - discard preallocations in group: * mark PAs deleted * move them onto local list * load on-disk bitmap * load group * remove PA from object (inode or locality group) * mark free blocks in-core * * - discard inode's preallocations: */ /* * Locking rules * * Locks: * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) * - cr_power2_aligned lists lock (cr_power2_aligned) * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa * object * group * * - find and use pa: * pa * * - release consumed pa: * pa * group * object * * - generate in-core bitmap: * group * pa * * - discard all for given object (inode, locality group): * object * pa * group * * - discard all for given group: * group * pa * group * object * * - allocation path (ext4_mb_regular_allocator) * group * cr_power2_aligned/cr_goal_len_fast */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for * each unique s_blocksize_bits */ #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static int ext4_mb_scan_group(struct ext4_allocation_context *ac, ext4_group_t group); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks); /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block * allocation in ext4_mb_new_blocks(). * 2. We increment this percpu discard_pa_seq counter when we either allocate * or free these blocks i.e. while marking those blocks as used/free in * mb_mark_used()/mb_free_blocks(). * 3. We also increment this percpu seq counter when we successfully identify * that the bb_prealloc_list is not empty and hence proceed for discarding * of those PAs inside ext4_mb_discard_group_preallocations(). * * Now to make sure that the regular fast path of block allocation is not * affected, as a small optimization we only sample the percpu seq counter * on that cpu. Only when the block allocation fails and when freed blocks * found were 0, that is when we sample percpu seq counter for all cpus using * below function ext4_get_discard_pa_seq_sum(). This happens after making * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. */ static DEFINE_PER_CPU(u64, discard_pa_seq); static inline u64 ext4_get_discard_pa_seq_sum(void) { int __cpu; u64 __seq = 0; for_each_possible_cpu(__cpu) __seq += per_cpu(discard_pa_seq, __cpu); return __seq; } static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif return addr; } static inline int mb_test_bit(int bit, void *addr) { /* * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit(bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline int mb_test_and_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { *max = 0; return NULL; } /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); return e4b->bd_bitmap; } bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; } #ifdef DOUBLE_CHECK static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int i; struct super_block *sb = e4b->bd_sb; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { int i; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; b1 = (unsigned char *) e4b->bd_info->bb_bitmap; b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { ext4_msg(e4b->bd_sb, KERN_ERR, "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc", e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } } } static void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { struct buffer_head *bh; grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); if (!grp->bb_bitmap) return; bh = ext4_read_block_bitmap(sb, group); if (IS_ERR_OR_NULL(bh)) { kfree(grp->bb_bitmap); grp->bb_bitmap = NULL; return; } memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); } static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { kfree(grp->bb_bitmap); } #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { return; } static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { return; } #endif #ifdef AGGRESSIVE_CHECK #define MB_CHECK_ASSERT(assert) \ do { \ if (!(assert)) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ function, file, line, # assert); \ BUG(); \ } \ } while (0) static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; int order = e4b->bd_blkbits + 1; int max; int max2; int i; int j; int k; int count; struct ext4_group_info *grp; int fragments = 0; int fstart; struct list_head *cur; void *buddy; void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); MB_CHECK_ASSERT(buddy); buddy2 = mb_find_buddy(e4b, order - 1, &max2); MB_CHECK_ASSERT(buddy2); MB_CHECK_ASSERT(buddy != buddy2); MB_CHECK_ASSERT(max * 2 == max2); count = 0; for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { /* only single bit in buddy2 may be 0 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); } continue; } /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( !mb_test_bit(k, e4b->bd_bitmap)); } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); order--; } fstart = -1; buddy = mb_find_buddy(e4b, 0, &max); for (i = 0; i < max; i++) { if (!mb_test_bit(i, buddy)) { MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); if (fstart == -1) { fragments++; fstart = i; } continue; } fstart = -1; /* check used bits only */ for (j = 0; j < e4b->bd_blkbits + 1; j++) { buddy2 = mb_find_buddy(e4b, j, &max2); k = i >> j; MB_CHECK_ASSERT(k < max2); MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif /* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. * Clear the bits in bitmap which the blocks of the chunk(s) covered, * then increase bb_counters[] for corresponded chunk size. */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; while (len > 0) { /* find how many blocks can be covered since this position */ max = ffs(first | border) - 1; /* find how many blocks of power 2 we need to mark */ min = fls(len) - 1; if (max < min) min = max; chunk = 1 << min; /* mark multiblock chunks only */ grp->bb_counters[min]++; if (min > 0) mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); len -= chunk; first += chunk; } } static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) { int order; /* * We don't bother with a special lists groups with only 1 block free * extents and for completely empty groups. */ order = fls(len) - 2; if (order < 0) return 0; if (order == MB_NUM_ORDERS(sb)) order--; if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) order = MB_NUM_ORDERS(sb) - 1; return order; } /* Move group to appropriate avg_fragment_size list */ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new, old; if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) return; old = grp->bb_avg_fragment_size_order; new = grp->bb_fragments == 0 ? -1 : mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); if (new == old) return; if (old >= 0) xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group); grp->bb_avg_fragment_size_order = new; if (new >= 0) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on. */ int err = xa_insert(&sbi->s_mb_avg_fragment_size[new], grp->bb_group, grp, GFP_ATOMIC); if (err) mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d", grp->bb_group, new, err); } } static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, struct xarray *xa, ext4_group_t start, ext4_group_t end) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); enum criteria cr = ac->ac_criteria; ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned long group = start; struct ext4_group_info *grp; if (WARN_ON_ONCE(end > ngroups || start >= end)) return 0; xa_for_each_range(xa, group, grp, start, end - 1) { int err; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); err = ext4_mb_scan_group(ac, grp->bb_group); if (err || ac->ac_status != AC_STATUS_CONTINUE) return err; cond_resched(); } return 0; } /* * Find a suitable group of given order from the largest free orders xarray. */ static inline int ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start, ext4_group_t end) { struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order]; if (xa_empty(xa)) return 0; return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. */ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i; int ret = 0; ext4_group_t start, end; start = group; end = ext4_get_groups_count(ac->ac_sb); wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* Increment cr and search again if no group is found */ ac->ac_criteria = CR_GOAL_LEN_FAST; return ret; } /* * Find a suitable group of given order from the average fragments xarray. */ static int ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start, ext4_group_t end) { struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order]; if (xa_empty(xa)) return 0; return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. */ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, ret = 0; ext4_group_t start, end; start = group; end = ext4_get_groups_count(ac->ac_sb); wrap_around: i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* * CR_BEST_AVAIL_LEN works based on the concept that we have * a larger normalized goal len request which can be trimmed to * a smaller goal len such that it can still satisfy original * request len. However, allocation request for non-regular * files never gets normalized. * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). */ if (ac->ac_flags & EXT4_MB_HINT_DATA) ac->ac_criteria = CR_BEST_AVAIL_LEN; else ac->ac_criteria = CR_GOAL_LEN_SLOW; return ret; } /* * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment * order we have and proactively trim the goal request length to that order to * find a suitable group faster. * * This optimizes allocation speed at the cost of slightly reduced * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case. */ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, ext4_group_t group) { int ret = 0; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, order, min_order; unsigned long num_stripe_clusters = 0; ext4_group_t start, end; /* * mb_avg_fragment_size_order() returns order in a way that makes * retrieving back the length using (1 << order) inaccurate. Hence, use * fls() instead since we need to know the actual length while modifying * goal length. */ order = fls(ac->ac_g_ex.fe_len) - 1; if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of * cluster ratio otherwise __ext4_fill_super exists early. */ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) /* * We consider 1 order less because later we round * up the goal len to num_stripe_clusters */ min_order = fls(num_stripe_clusters) - 1; } if (1 << min_order < ac->ac_o_ex.fe_len) min_order = fls(ac->ac_o_ex.fe_len); start = group; end = ext4_get_groups_count(ac->ac_sb); wrap_around: for (i = order; i >= min_order; i--) { int frag_order; /* * Scale down goal len to make sure we find something * in the free fragments list. Basically, reduce * preallocations. */ ac->ac_g_ex.fe_len = 1 << i; if (num_stripe_clusters > 0) { /* * Try to round up the adjusted goal length to * stripe size (in cluster units) multiple for * efficiency. */ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, num_stripe_clusters); } frag_order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); ac->ac_criteria = CR_GOAL_LEN_SLOW; return ret; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; return 1; } /* * next linear group for allocation. */ static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups) { /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ *group = *group + 1 >= ngroups ? 0 : *group + 1; } static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac, ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count) { int ret, i; enum criteria cr = ac->ac_criteria; struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group = *start; for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) { ret = ext4_mb_scan_group(ac, group); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; cond_resched(); } *start = group; if (count == ngroups) ac->ac_criteria++; /* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); return 0; } static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) { int ret = 0; ext4_group_t start; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); /* non-extent files are limited to low blocks/groups */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = sbi->s_blockfile_groups; /* searching for the right group start from the goal value specified */ start = ac->ac_g_ex.fe_group; ac->ac_prefetch_grp = start; ac->ac_prefetch_nr = 0; if (!should_optimize_scan(ac)) return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups); /* * Optimized scanning can return non adjacent groups which can cause * seek overhead for rotational disks. So try few linear groups before * trying optimized scan. */ if (sbi->s_mb_max_linear_groups) ret = ext4_mb_scan_groups_linear(ac, ngroups, &start, sbi->s_mb_max_linear_groups); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; switch (ac->ac_criteria) { case CR_POWER2_ALIGNED: return ext4_mb_scan_groups_p2_aligned(ac, start); case CR_GOAL_LEN_FAST: return ext4_mb_scan_groups_goal_fast(ac, start); case CR_BEST_AVAIL_LEN: return ext4_mb_scan_groups_best_avail(ac, start); default: /* * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an * rb tree sorted by bb_free. But until that happens, we should * never come here. */ WARN_ON(1); } return 0; } /* * Cache the order of the largest free extent we have available in this block * group. */ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new, old = grp->bb_largest_free_order; for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) if (grp->bb_counters[new] > 0) break; /* No need to move between order lists? */ if (new == old) return; if (old >= 0) { struct xarray *xa = &sbi->s_mb_largest_free_orders[old]; if (!xa_empty(xa) && xa_load(xa, grp->bb_group)) xa_erase(xa, grp->bb_group); } grp->bb_largest_free_order = new; if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on. */ int err = xa_insert(&sbi->s_mb_largest_free_orders[new], grp->bb_group, grp, GFP_ATOMIC); if (err) mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d", grp->bb_group, new, err); } } static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { fragments++; first = i; i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) ext4_mb_mark_free_simple(sb, buddy, first, len, grp); else grp->bb_counters[0]++; if (i < max) i = mb_find_next_zero_bit(bitmap, max, i); } grp->bb_fragments = fragments; if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, "block bitmap and bg descriptor " "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); mb_update_avg_fragment_size(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); } static void mb_regenerate_buddy(struct ext4_buddy *e4b) { int count; int order = 1; void *buddy; while ((buddy = mb_find_buddy(e4b, order++, &count))) mb_set_bits(buddy, 0, count); e4b->bd_info->bb_fragments = 0; memset(e4b->bd_info->bb_counters, 0, sizeof(*e4b->bd_info->bb_counters) * (e4b->bd_sb->s_blocksize_bits + 2)); ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); } /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A page can * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 * * Locking note: This routine takes the block group lock of all groups * for this page; do not hold this lock when calling this routine! */ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; int blocks_per_page; int groups_per_page; int err = 0; int i; ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; struct ext4_group_info *grinfo; inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; mb_debug(sb, "init folio %lu\n", folio->index); groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) groups_per_page = 1; /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM; } else bh = &bhs; first_group = folio->index * blocks_per_page / 2; /* read all groups the folio covers into the cache */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { if (group >= ngroups) break; grinfo = ext4_get_group_info(sb, group); if (!grinfo) continue; /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. */ if (folio_test_uptodate(folio) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); if (IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; goto out; } mb_debug(sb, "read bitmap for group %u\n", group); } /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { int err2; if (!bh[i]) continue; err2 = ext4_wait_block_bitmap(sb, group, bh[i]); if (!err) err = err2; } first_block = folio->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; if (!bh[group - first_group]) /* skip initialized uptodate buddy */ continue; if (!buffer_verified(bh[group - first_group])) /* Skip faulty bitmaps */ continue; err = 0; /* * data carry information regarding this * particular group in the format specified * above * */ data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* * We place the buddy block and bitmap block * close together */ grinfo = ext4_get_group_info(sb, group); if (!grinfo) { err = -EFSCORRUPTED; goto out; } if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); /* init the buddy */ memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ BUG_ON(incore != NULL); mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); memcpy(data, bitmap, blocksize); /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be * generated using this */ incore = data; } } folio_mark_uptodate(folio); out: if (bh) { for (i = 0; i < groups_per_page; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); } return err; } /* * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; struct folio *folio; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ return 0; } /* blocks_per_page == 1, hence we need another page for the buddy */ folio = __filemap_get_folio(inode->i_mapping, block + 1, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) { folio_unlock(e4b->bd_bitmap_folio); folio_put(e4b->bd_bitmap_folio); } if (e4b->bd_buddy_folio) { folio_unlock(e4b->bd_buddy_folio); folio_put(e4b->bd_buddy_folio); } } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct folio *folio; int ret = 0; might_sleep(); mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); if (!this_grp) return -EFSCORRUPTED; /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed. */ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ goto err; } folio = e4b.bd_bitmap_folio; ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ ret = 0; goto err; } /* init buddy cache */ folio = e4b.bd_buddy_folio; ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } err: ext4_mb_put_buddy_page_lock(&e4b); return ret; } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { int blocks_per_page; int block; int pnum; int poff; struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; might_sleep(); mb_debug(sb, "load group %u\n", group); blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); if (!grp) return -EFSCORRUPTED; e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* * we need full data about the group * to make a good selection */ ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; /* Avoid locking the folio in the fast path ... */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) /* * drop the folio reference and try * to get the folio with lock. If we * are not uptodate that implies * somebody just created the folio but * is yet to initialize it. So * wait for it to initialize. */ folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { folio_unlock(folio); goto err; } mb_cmp_bitmaps(e4b, folio_address(folio) + (poff * sb->s_blocksize)); } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Folios marked accessed already */ e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { folio_unlock(folio); goto err; } } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Folios marked accessed already */ e4b->bd_buddy_folio = folio; e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); return 0; err: if (!IS_ERR_OR_NULL(folio)) folio_put(folio); if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; } static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); } static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); if (e4b->bd_buddy_folio) folio_put(e4b->bd_buddy_folio); } static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1, max; void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); while (order <= e4b->bd_blkbits + 1) { bb = mb_find_buddy(e4b, order, &max); if (!mb_test_bit(block >> order, bb)) { /* this block is part of buddy of order 'order' */ return order; } order++; } return 0; } static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); *addr = 0; cur += 32; continue; } mb_clear_bit(cur, bm); cur++; } } /* clear bits in given range * will return first found zero bit if any, -1 otherwise */ static int mb_test_and_clear_bits(void *bm, int cur, int len) { __u32 *addr; int zero_bit = -1; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); if (*addr != (__u32)(-1) && zero_bit == -1) zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); *addr = 0; cur += 32; continue; } if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) zero_bit = cur; cur++; } return zero_bit; } void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: set whole word at once */ addr = bm + (cur >> 3); *addr = 0xffffffff; cur += 32; continue; } mb_set_bit(cur, bm); cur++; } } static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { if (mb_test_bit(*bit + side, bitmap)) { mb_clear_bit(*bit, bitmap); (*bit) -= side; return 1; } else { (*bit) += side; mb_set_bit(*bit, bitmap); return -1; } } static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) { int max; int order = 1; void *buddy = mb_find_buddy(e4b, order, &max); while (buddy) { void *buddy2; /* Bits in range [first; last] are known to be set since * corresponding blocks were allocated. Bits in range * (first; last) will stay set because they form buddies on * upper layer. We just deal with borders if they don't * align with upper layer and then go up. * Releasing entire group is all about clearing * single bit of highest order buddy. */ /* Example: * --------------------------------- * | 1 | 1 | 1 | 1 | * --------------------------------- * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | * --------------------------------- * 0 1 2 3 4 5 6 7 * \_____________________/ * * Neither [1] nor [6] is aligned to above layer. * Left neighbour [0] is free, so mark it busy, * decrease bb_counters and extend range to * [0; 6] * Right neighbour [7] is busy. It can't be coaleasced with [6], so * mark [6] free, increase bb_counters and shrink range to * [0; 5]. * Then shift range to [0; 2], go up and do the same. */ if (first & 1) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); if (!(last & 1)) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); if (first > last) break; order++; buddy2 = mb_find_buddy(e4b, order, &max); if (!buddy2) { mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; break; } first >>= 1; last >>= 1; buddy = buddy2; } } static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int left_is_free = 0; int right_is_free = 0; int block; int last = first + count - 1; struct super_block *sb = e4b->bd_sb; if (WARN_ON(count == 0)) return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return; mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ if (first != 0) left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; /* * Fastcommit replay can free already freed blocks which * corrupts allocation info. Regenerate it. */ if (sbi->s_mount_state & EXT4_FC_REPLAY) { mb_regenerate_buddy(e4b); goto check; } blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing already freed block (bit %u); block bitmap corrupt.", block); return; } this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; /* buddy[0] == bd_bitmap is a special case, so handle * it right away and let mb_buddy_mark_free stay free of * zero order checks. * Check if neighbours are to be coaleasced, * adjust bitmap bb_counters and borders appropriately. */ if (first & 1) { first += !left_is_free; e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; } if (!(last & 1)) { last -= !right_is_free; e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; } if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); check: mb_check_buddy(e4b); } static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; return 0; } /* find actual order */ order = mb_find_order_for_block(e4b, block); ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); ex->fe_start = block; ex->fe_group = e4b->bd_group; block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); block = next >> order; ex->fe_len += 1 << order; } if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, "corruption or bug in mb_find_extent " "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", block, order, needed, ex->fe_group, ex->fe_start, ex->fe_len, ex->fe_logical); ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; } return ex->fe_len; } static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) { int ord; int mlen = 0; int max = 0; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; /* let's maintain fragments counter */ if (start != 0) mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) e4b->bd_info->bb_fragments--; /* let's maintain buddy itself */ while (len) { ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! */ mlen = 1 << ord; buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; start += mlen; len -= mlen; BUG_ON(len < 0); continue; } /* store for history */ if (ret == 0) ret = len | (ord << 16); BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; ord_start = (start >> ord) << ord; ord_end = ord_start + (1 << ord); /* first chunk */ if (start > ord_start) ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, ord_start, start - ord_start, e4b->bd_info); /* last chunk */ if (start + len < ord_end) { ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, start + len, ord_end - (start + len), e4b->bd_info); break; } len = start + len - ord_end; start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; } /* * Must be called under group lock! */ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int ret; BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); BUG_ON(ac->ac_status == AC_STATUS_FOUND); ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; ret = mb_mark_used(e4b, &ac->ac_b_ex); /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; ac->ac_status = AC_STATUS_FOUND; ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; /* * take the page reference. We want the page to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ ac->ac_bitmap_folio = e4b->bd_bitmap_folio; folio_get(ac->ac_bitmap_folio); ac->ac_buddy_folio = e4b->bd_buddy_folio; folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); } /* * As we've just preallocated more space than * user requested originally, we store allocated * space in a special descriptor. */ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) ext4_mb_new_preallocation(ac); } static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; if (ac->ac_status == AC_STATUS_FOUND) return; /* * We don't want to scan for a whole year */ if (ac->ac_found > sbi->s_mb_max_to_scan && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { ac->ac_status = AC_STATUS_BREAK; return; } /* * Haven't found good chunk so far, let's continue */ if (bex->fe_len < gex->fe_len) return; if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan) ext4_mb_use_best_found(ac, e4b); } /* * The routine checks whether found extent is good enough. If it is, * then the extent gets marked used and flag is set to the context * to stop scanning. Otherwise, the extent is compared with the * previous found extent and if new one is better, then it's stored * in the context. Later, the best found extent will be used, if * mballoc can't find good enough extent. * * The algorithm used is roughly as follows: * * * If free extent found is exactly as big as goal, then * stop the scan and use it immediately * * * If free extent found is smaller than goal, then keep retrying * upto a max of sbi->s_mb_max_to_scan times (default 200). After * that stop scanning and use whatever we have. * * * If free extent found is bigger than goal, then keep retrying * upto a max of sbi->s_mb_min_to_scan times (default 10) before * stopping the scan and using the extent. * * * FIXME: real allocation policy is to be designed yet! */ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *ex, struct ext4_buddy *e4b) { struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; /* * The special case - take what you catch first */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * Let's check whether the chuck is good enough */ if (ex->fe_len == gex->fe_len) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * If this is first found extent, just store it in the context */ if (bex->fe_len == 0) { *bex = *ex; return; } /* * If new found extent is better, store it in the context */ if (bex->fe_len < gex->fe_len) { /* if the request isn't satisfied, any found extent * larger than previous best one is better */ if (ex->fe_len > bex->fe_len) *bex = *ex; } else if (ex->fe_len > gex->fe_len) { /* if the request is satisfied, then we try to find * an extent that still satisfy the request, but is * smaller than previous one */ if (ex->fe_len < bex->fe_len) *bex = *ex; } ext4_mb_check_limits(ac, e4b, 0); } static noinline_for_stack void ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; ext4_group_t group = ex.fe_group; int max; int err; BUG_ON(ex.fe_len <= 0); err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } static noinline_for_stack int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!grp) return -EFSCORRUPTED; if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) return 0; if (grp->bb_free == 0) return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } } else if (max >= ac->ac_g_ex.fe_len) { BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { /* Sometimes, caller may want to merge even small * number of blocks to an existing extent */ BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); return 0; } /* * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ static noinline_for_stack void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_group_info *grp = e4b->bd_info; void *buddy; int i; int k; int max; BUG_ON(ac->ac_2order <= 0); for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; buddy = mb_find_buddy(e4b, i, &max); if (WARN_RATELIMIT(buddy == NULL, "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) continue; k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { ext4_mark_group_bitmap_corrupted(ac->ac_sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); break; } ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; ac->ac_b_ex.fe_group = e4b->bd_group; ext4_mb_use_best_found(ac, e4b); BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); break; } } /* * The routine scans the group and measures all found extents. * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ static noinline_for_stack void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i, j, freelen; int free; free = e4b->bd_info->bb_free; if (WARN_ON(free <= 0)) return; i = e4b->bd_info->bb_first_free; while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * have free blocks */ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); break; } if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough * continuous free extent, so skip over the smaller free * extents */ j = mb_find_next_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); freelen = j - i; if (freelen < ac->ac_g_ex.fe_len) { i = j; free -= freelen; continue; } } mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit * without claiming the space. */ break; } ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; free -= ex.fe_len; } ext4_mb_check_limits(ac, e4b, 1); } /* * This is a special case for storages like raid5 * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; ext4_grpblk_t i, stripe; int max; BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, i, stripe, &ex); if (max >= stripe) { ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; } } i += stripe; } } static void __ext4_mb_scan_group(struct ext4_allocation_context *ac) { bool is_stripe_aligned; struct ext4_sb_info *sbi; enum criteria cr = ac->ac_criteria; ac->ac_groups_scanned++; if (cr == CR_POWER2_ALIGNED) return ext4_mb_simple_scan_group(ac, ac->ac_e4b); sbi = EXT4_SB(ac->ac_sb); is_stripe_aligned = false; if ((sbi->s_stripe >= sbi->s_cluster_ratio) && !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe))) is_stripe_aligned = true; if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && is_stripe_aligned) ext4_mb_scan_aligned(ac, ac->ac_e4b); if (ac->ac_status == AC_STATUS_CONTINUE) ext4_mb_complex_scan_group(ac, ac->ac_e4b); } /* * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable * for the allocation or not. */ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; if (free == 0) return false; fragments = grp->bb_fragments; if (fragments == 0) return false; switch (cr) { case CR_POWER2_ALIGNED: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return false; if (free < ac->ac_g_ex.fe_len) return false; if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) return false; return true; case CR_GOAL_LEN_FAST: case CR_BEST_AVAIL_LEN: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; case CR_GOAL_LEN_SLOW: if (free >= ac->ac_g_ex.fe_len) return true; break; case CR_ANY_FREE: return true; default: BUG(); } return false; } /* * This could return negative error code if something goes wrong * during ext4_mb_init_group(). This should not be called with * ext4_lock_group() held. * * Note: because we are conditionally operating with the group lock in * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this * function using __acquire and __release. This means we need to be * super careful before messing with the error path handling via "goto * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; if (!grp) return -EFSCORRUPTED; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } free = grp->bb_free; if (free == 0) goto out; /* * In all criterias except CR_ANY_FREE we try to avoid groups that * can't possibly satisfy the full goal request due to insufficient * free blocks. */ if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; /* * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this * gets used for metadata block allocation, and we want to make * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) return 0; ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) return ret; } if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } ret = ext4_mb_good_group(ac, group, cr); out: if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } return ret; } /* * Start prefetching @nr block bitmaps starting at @group. * Return the next group which needs to be prefetched. */ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct buffer_head *bh; struct blk_plug plug; blk_start_plug(&plug); while (nr-- > 0) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); struct ext4_group_info *grp = ext4_get_group_info(sb, group); /* * Prefetch block groups with free blocks; but don't * bother if it is marked uninitialized on disk, since * it won't require I/O to read. Also only try to * prefetch once, so we avoid getblk() call, which can * be expensive. */ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); if (bh && !IS_ERR(bh)) { if (!buffer_uptodate(bh) && cnt) (*cnt)++; brelse(bh); } } if (++group >= ngroups) group = 0; } blk_finish_plug(&plug); return group; } /* * Batch reads of the block allocation bitmaps to get * multiple READs in flight; limit prefetching at inexpensive * CR, otherwise mballoc can spend a lot of time loading * imperfect groups */ static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi; if (ac->ac_prefetch_grp != group) return; sbi = EXT4_SB(ac->ac_sb); if (ext4_mb_cr_expensive(ac->ac_criteria) || ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) { unsigned int nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(ac->ac_sb)) { nr = 1 << sbi->s_log_groups_per_flex; nr -= group & (nr - 1); nr = umin(nr, sbi->s_mb_prefetch); } ac->ac_prefetch_nr = nr; ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr, &ac->ac_prefetch_ios); } } /* * Prefetching reads the block bitmap into the buffer cache; but we * need to make sure that the buddy bitmap in the page cache has been * initialized. Note that ext4_mb_init_group() will block if the I/O * is not yet completed, or indeed if it was not initiated by * ext4_mb_prefetch did not start the I/O. * * TODO: We should actually kick off the buddy bitmap setup in a work * queue when the buffer I/O is completed, so that we don't block * waiting for the block allocation bitmap read to finish when * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). */ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr) { struct ext4_group_desc *gdp; struct ext4_group_info *grp; while (nr-- > 0) { if (!group) group = ext4_get_groups_count(sb); group--; gdp = ext4_get_group_desc(sb, group, NULL); grp = ext4_get_group_info(sb, group); if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0) { if (ext4_mb_init_group(sb, group, GFP_NOFS)) break; } } } static int ext4_mb_scan_group(struct ext4_allocation_context *ac, ext4_group_t group) { int ret; struct super_block *sb = ac->ac_sb; enum criteria cr = ac->ac_criteria; ext4_mb_might_prefetch(ac, group); /* prevent unnecessary buddy loading. */ if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) return 0; /* This now checks without needing the buddy page */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!ac->ac_first_err) ac->ac_first_err = ret; return 0; } ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b); if (ret) return ret; /* skip busy group */ if (cr >= CR_ANY_FREE) ext4_lock_group(sb, group); else if (!ext4_try_lock_group(sb, group)) goto out_unload; /* We need to check again after locking the block group. */ if (unlikely(!ext4_mb_good_group(ac, group, cr))) goto out_unlock; __ext4_mb_scan_group(ac); out_unlock: ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(ac->ac_e4b); return ret; } static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t i; int err = 0; struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ err = ext4_mb_find_by_goal(ac, &e4b); if (err || ac->ac_status == AC_STATUS_FOUND) goto out; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) goto out; /* * ac->ac_2order is set only if the fe_len is a power of 2 * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); ac->ac_g_ex.fe_start = -1; ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL; } /* * Let's just scan groups to find more-less suitable blocks We * start with CR_GOAL_LEN_FAST, unless it is power of 2 * aligned, in which case let's do that faster approach first. */ ac->ac_criteria = CR_GOAL_LEN_FAST; if (ac->ac_2order) ac->ac_criteria = CR_POWER2_ALIGNED; ac->ac_e4b = &e4b; ac->ac_prefetch_ios = 0; ac->ac_first_err = 0; repeat: while (ac->ac_criteria < EXT4_MB_NUM_CRS) { err = ext4_mb_scan_groups(ac); if (err) goto out; if (ac->ac_status != AC_STATUS_CONTINUE) break; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { /* * We've been searching too long. Let's try to allocate * the best chunk we've found so far */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { int lost; /* * Someone more lucky has already allocated it. * The only thing we can do is just take first * found block(s) */ lost = atomic_inc_return(&sbi->s_mb_lost_chunks); mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, lost); ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; ac->ac_criteria = CR_ANY_FREE; goto repeat; } } if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) { atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); if (ac->ac_flags & EXT4_MB_STREAM_ALLOC && ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group) atomic_inc(&sbi->s_bal_stream_goals); } out: if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err) err = ac->ac_first_err; mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, ac->ac_flags, ac->ac_criteria, err); if (ac->ac_prefetch_nr) ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr); return err; } static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; ++*pos; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i, err; char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); if (!grinfo) return 0; /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } ext4_mb_unload_buddy(&e4b); } /* * We care only about free space counters in the group info and * these are safe to access even after the buddy has been unloaded */ memcpy(sg, grinfo, i); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? sg->bb_counters[i] : 0); seq_puts(seq, " ]"); if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) seq_puts(seq, " Block bitmap corrupted!"); seq_putc(seq, '\n'); return 0; } static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct ext4_sb_info *sbi = EXT4_SB(sb); seq_puts(seq, "mballoc:\n"); if (!sbi->s_mb_stats) { seq_puts(seq, "\tmb stats collection turned off.\n"); seq_puts( seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); return 0; } seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); /* CR_POWER2_ALIGNED stats */ seq_puts(seq, "\tcr_p2_aligned_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); /* CR_GOAL_LEN_FAST stats */ seq_puts(seq, "\tcr_goal_fast_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); /* CR_BEST_AVAIL_LEN stats */ seq_puts(seq, "\tcr_best_avail_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); /* CR_GOAL_LEN_SLOW stats */ seq_puts(seq, "\tcr_goal_slow_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); /* CR_ANY_FREE stats */ seq_puts(seq, "\tcr_any_free_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); /* Aggregates */ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); seq_printf(seq, "\t\tstream_goal_hits: %u\n", atomic_read(&sbi->s_bal_stream_goals)); seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); seq_printf(seq, "\tbuddies_generated: %u/%u\n", atomic_read(&sbi->s_mb_buddies_generated), ext4_get_groups_count(sb)); seq_printf(seq, "\tbuddies_time_used: %llu\n", atomic64_read(&sbi->s_mb_generation_time)); seq_printf(seq, "\tpreallocated: %u\n", atomic_read(&sbi->s_mb_preallocated)); seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); return 0; } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; ++*pos; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; unsigned int count; unsigned long idx; position--; if (position >= MB_NUM_ORDERS(sb)) { position -= MB_NUM_ORDERS(sb); if (position == 0) seq_puts(seq, "avg_fragment_size_lists:\n"); count = 0; xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp) count++; seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } if (position == 0) { seq_printf(seq, "optimize_scan: %d\n", test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); seq_puts(seq, "max_free_order_lists:\n"); } count = 0; xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp) count++; seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_structs_summary_ops = { .start = ext4_mb_seq_structs_summary_start, .next = ext4_mb_seq_structs_summary_next, .stop = ext4_mb_seq_structs_summary_stop, .show = ext4_mb_seq_structs_summary_show, }; static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; BUG_ON(!cachep); return cachep; } /* * Allocate the top-level s_group_info array for the specified number * of groups */ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned size; struct ext4_group_info ***old_groupinfo, ***new_groupinfo; size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); if (size <= sbi->s_group_info_size) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } rcu_read_lock(); old_groupinfo = rcu_dereference(sbi->s_group_info); if (old_groupinfo) memcpy(new_groupinfo, old_groupinfo, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); rcu_read_unlock(); rcu_assign_pointer(sbi->s_group_info, new_groupinfo); sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { int i; int metalen = 0; int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. * If it's true, we have to allocate a new table of pointers * to ext4_group_info structures */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); return -ENOMEM; } rcu_read_lock(); rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; rcu_read_unlock(); } meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); /* * initialize bb_free to be able to skip * empty groups without initialization */ if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { meta_group_info[i]->bb_free = ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; exit_group_info: /* If a meta_group_info table has been allocated, release it now */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { struct ext4_group_info ***group_info; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); kfree(group_info[idx]); group_info[idx] = NULL; rcu_read_unlock(); } return -ENOMEM; } /* ext4_mb_add_groupinfo */ static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; struct ext4_group_desc *desc; struct ext4_group_info ***group_info; struct kmem_cache *cachep; err = ext4_mb_alloc_groupinfo(sb, ngroups); if (err) return err; sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } /* To avoid potentially colliding with an valid on-disk inode number, * use EXT4_BAD_INO for the buddy cache inode number. This inode is * not in the inode hash, so it should never be found by iget(), but * this will avoid confusion if it ever shows up during debugging. */ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) goto err_freebuddy; } if (ext4_has_feature_flex_bg(sb)) { /* a single flex group is supposed to be read by a single IO. * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is * unsigned integer, so the maximum shift is 32. */ if (sbi->s_es->s_log_groups_per_flex >= 32) { ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); goto err_freebuddy; } sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { sbi->s_mb_prefetch = 32; } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); /* * now many real IOs to prefetch within a single allocation at * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related * optimization we shouldn't try to load too many groups, at some point * we should start to use what we've got in memory. * with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); return 0; err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); if (grp) kmem_cache_free(cachep, grp); } i = sbi->s_group_info_size; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); while (i-- > 0) kfree(group_info[i]); rcu_read_unlock(); iput(sbi->s_buddy_cache); err_freesgi: rcu_read_lock(); kvfree(rcu_dereference(sbi->s_group_info)); rcu_read_unlock(); return -ENOMEM; } static void ext4_groupinfo_destroy_slabs(void) { int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } static int ext4_groupinfo_create_slab(size_t size) { static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); int slab_size; int blocksize_bits = order_base_2(size); int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep; if (cache_index >= NR_GRPINFO_CACHES) return -EINVAL; if (unlikely(cache_index < 0)) cache_index = 0; mutex_lock(&ext4_grpinfo_slab_create_mutex); if (ext4_groupinfo_caches[cache_index]) { mutex_unlock(&ext4_grpinfo_slab_create_mutex); return 0; /* Already created */ } slab_size = offsetof(struct ext4_group_info, bb_counters[blocksize_bits + 2]); cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); ext4_groupinfo_caches[cache_index] = cachep; mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { printk(KERN_EMERG "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } return 0; } static void ext4_discard_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_discard_work); struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); load_grp = UINT_MAX; list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { /* * If filesystem is umounting or no memory or suffering * from no space, give up the discard */ if ((sb->s_flags & SB_ACTIVE) && !err && !atomic_read(&sbi->s_retry_alloc_pending)) { grp = fd->efd_group; if (grp != load_grp) { if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); err = ext4_mb_load_buddy(sb, grp, &e4b); if (err) { kmem_cache_free(ext4_free_data_cachep, fd); load_grp = UINT_MAX; continue; } else { load_grp = grp; } } ext4_lock_group(sb, grp); ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, fd->efd_start_cluster + fd->efd_count - 1, 1); ext4_unlock_group(sb, grp); } kmem_cache_free(ext4_free_data_cachep, fd); } if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); } static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) { for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_avg_fragment_size[i]); kfree(sbi->s_mb_avg_fragment_size); } static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) { for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_largest_free_orders[i]); kfree(sbi->s_mb_largest_free_orders); } int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; unsigned offset, offset_incr; unsigned max; int ret; i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { ret = -ENOMEM; goto out; } i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; goto out; } ret = ext4_groupinfo_create_slab(sb->s_blocksize); if (ret < 0) goto out; /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; sbi->s_mb_offsets[0] = 0; i = 1; offset = 0; offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; offset += offset_incr; offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i < MB_NUM_ORDERS(sb)); sbi->s_mb_avg_fragment_size = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_avg_fragment_size) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) xa_init(&sbi->s_mb_avg_fragment_size[i]); sbi->s_mb_largest_free_orders = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_largest_free_orders) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) xa_init(&sbi->s_mb_largest_free_orders[i]); spin_lock_init(&sbi->s_md_lock); atomic_set(&sbi->s_mb_free_pending, 0); INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); atomic_set(&sbi->s_retry_alloc_pending, 0); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file * systems, this is probably too big (i.e, if the cluster size * is 1 megabyte, then group preallocation size becomes half a * gigabyte!). As a default, we will keep a two megabyte * group pralloc size for cluster sizes up to 64k, and after * that, we will force a minimum group preallocation size of * 32 clusters. This translates to 8 megs when the cluster * size is 256k, and 32 megs when the cluster size is 1 meg, * which seems reasonable as a default. */ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> sbi->s_cluster_bits, 32); /* * If there is a s_stripe > 1, then we set the s_mb_group_prealloc * to the lowest multiple of s_stripe which is bigger than * the s_mb_group_prealloc as determined above. We want * the preallocation size to be an exact multiple of the * RAID stripe size so that preallocations don't fragment * the stripes. */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } sbi->s_mb_nr_global_goals = umin(num_possible_cpus(), DIV_ROUND_UP(sbi->s_groups_count, 4)); sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals, sizeof(ext4_group_t), GFP_KERNEL); if (sbi->s_mb_last_groups == NULL) { ret = -ENOMEM; goto out; } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; goto out_free_last_groups; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); for (j = 0; j < PREALLOC_TB_SIZE; j++) INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } if (bdev_nonrot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) goto out_free_locality_groups; return 0; out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out_free_last_groups: kfree(sbi->s_mb_last_groups); sbi->s_mb_last_groups = NULL; out: ext4_mb_avg_fragment_size_destroy(sbi); ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); sbi->s_mb_maxs = NULL; return ret; } /* need to called with the ext4 group lock held */ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; struct list_head *cur, *tmp; int count = 0; list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; kmem_cache_free(ext4_pspace_cachep, pa); } return count; } void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; if (test_opt(sb, DISCARD)) { /* * wait the discard work to drain all of ext4_free_data */ flush_work(&sbi->s_discard_work); WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); } if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); if (!grinfo) continue; mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); count = ext4_mb_cleanup_pa(grinfo); if (count) mb_debug(sb, "mballoc: %d PAs left\n", count); ext4_unlock_group(sb, i); kmem_cache_free(cachep, grinfo); } num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); for (i = 0; i < num_meta_group_infos; i++) kfree(group_info[i]); kvfree(group_info); rcu_read_unlock(); } ext4_mb_avg_fragment_size_destroy(sbi); ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { ext4_msg(sb, KERN_INFO, "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_groups_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, "mballoc: %u generated and it took %llu", atomic_read(&sbi->s_mb_buddies_generated), atomic64_read(&sbi->s_mb_generation_time)); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } free_percpu(sbi->s_locality_groups); kfree(sbi->s_mb_last_groups); } static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + ext4_group_first_block_no(sb, block_group)); count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, struct ext4_free_data *entry) { struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0; mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; ext4_lock_group(sb, entry->efd_group); /* Take it out of per group rb tree */ rb_erase(&entry->efd_node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. */ EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ folio_put(e4b.bd_buddy_folio); folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); mb_debug(sb, "freed %d blocks in 1 structures\n", count); } /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. */ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; LIST_HEAD(freed_data_list); struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; bool wake; list_replace_init(s_freed_head, &freed_data_list); list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); if (test_opt(sb, DISCARD)) { spin_lock(&sbi->s_md_lock); wake = list_empty(&sbi->s_discard_list); list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) queue_work(system_unbound_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); } } int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) goto out; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) goto out_pa_free; ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); if (ext4_free_data_cachep == NULL) goto out_ac_free; return 0; out_ac_free: kmem_cache_destroy(ext4_ac_cachep); out_pa_free: kmem_cache_destroy(ext4_pspace_cachep); out: return -ENOMEM; } void ext4_exit_mballoc(void) { /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. */ rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); } #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 #define EXT4_MB_SYNC_UPDATE 0x0002 static int ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, ext4_group_t group, ext4_grpblk_t blkoff, ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; int err; unsigned int i, already, changed = len; KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, handle, sb, state, group, blkoff, len, flags, ret_changed); if (ret_changed) *ret_changed = 0; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) return PTR_ERR(bitmap_bh); if (handle) { BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (err) goto out_err; } err = -EIO; gdp = ext4_get_group_desc(sb, group, &gdp_bh); if (!gdp) goto out_err; if (handle) { BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); if (err) goto out_err; } ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); } if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { already = 0; for (i = 0; i < len; i++) if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == state) already++; changed = len - already; } if (state) { mb_set_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) - changed); } else { mb_clear_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) + changed); } ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); if (ret_changed) *ret_changed = changed; if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group); struct flex_groups *fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); if (state) atomic64_sub(changed, &fg->free_clusters); else atomic64_add(changed, &fg->free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); if (err) goto out_err; if (flags & EXT4_MB_SYNC_UPDATE) { sync_dirty_buffer(bitmap_bh); sync_dirty_buffer(gdp_bh); } out_err: brelse(bitmap_bh); return err; } /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { struct ext4_group_desc *gdp; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; int flags = 0; ext4_grpblk_t changed; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); sb = ac->ac_sb; sbi = EXT4_SB(sb); gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); if (!gdp) return -EIO; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, 0, NULL); if (!err) err = -EFSCORRUPTED; return err; } #ifdef AGGRESSIVE_CHECK flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, flags, &changed); if (err && changed == 0) return err; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != ac->ac_b_ex.fe_len); #endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); return err; } /* * Idempotent helper for Ext4 fast commit replay path to set the state of * blocks in bitmaps and update counters. */ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; int err = 0; unsigned int clen, thisgrp_len; while (len > 0) { ext4_get_group_no_and_offset(sb, block, &group, &blkoff); /* * Check to see if we are freeing blocks across a group * boundary. * In case of flex_bg, this can happen that (block, len) may * span across more than one group. In that case we need to * get the corresponding group metadata to work with. * For this we have goto again loop. */ thisgrp_len = min_t(unsigned int, (unsigned int)len, EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); clen = EXT4_NUM_B2C(sbi, thisgrp_len); if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { ext4_error(sb, "Marking blocks in system zone - " "Block = %llu, len = %u", block, thisgrp_len); break; } err = ext4_mb_mark_context(NULL, sb, state, group, blkoff, clen, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); if (err) break; block += thisgrp_len; len -= thisgrp_len; BUG_ON(len < 0); } } /* * here we normalize request for locality group * Group request are normalized to s_mb_group_prealloc, which goes to * s_strip if we set the same via mount option. * s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } /* * This function returns the next element to look at during inode * PA rbtree walk. We assume that we have held the inode PA rbtree lock * (ei->i_prealloc_lock) * * new_start The start of the range we want to compare * cur_start The existing start that we are comparing against * node The node of the rb_tree */ static inline struct rb_node* ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) { if (new_start < cur_start) return node->rb_left; else return node->rb_right; } static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; ext4_lblk_t tmp_pa_start; loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } read_unlock(&ei->i_prealloc_lock); } /* * Given an allocation context "ac" and a range "start", "end", check * and adjust boundaries if the range overlaps with any of the existing * preallocatoins stored in the corresponding inode of the allocation context. * * Parameters: * ac allocation context * start start of the new range * end end of the new range */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; /* * Adjust the normalized range so that it doesn't overlap with any * existing preallocated blocks(PAs). Make sure to hold the rbtree lock * so it doesn't change underneath us. */ read_lock(&ei->i_prealloc_lock); /* Step 1: find any one immediate neighboring PA of the normalized range */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || ac->ac_o_ex.fe_logical < tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } /* * Step 2: check if the found PA is left or right neighbor and * get the other neighbor */ if (tmp_pa) { if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { struct rb_node *tmp; left_pa = tmp_pa; tmp = rb_next(&left_pa->pa_node.inode_node); if (tmp) { right_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } else { struct rb_node *tmp; right_pa = tmp_pa; tmp = rb_prev(&right_pa->pa_node.inode_node); if (tmp) { left_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } } /* Step 3: get the non deleted neighbors */ if (left_pa) { for (iter = &left_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { left_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); left_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (right_pa) { for (iter = &right_pa->pa_node.inode_node;; iter = rb_next(iter)) { if (!iter) { right_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); right_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (left_pa) { left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } if (right_pa) { right_pa_start = right_pa->pa_lstart; BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); } /* Step 4: trim our normalized range to not overlap with the neighbors */ if (left_pa) { if (left_pa_end > new_start) new_start = left_pa_end; } if (right_pa) { if (right_pa_start < new_end) new_end = right_pa_start; } read_unlock(&ei->i_prealloc_lock); /* XXX: extra loop to check we really don't overlap preallocations */ ext4_mb_pa_assert_overlap(ac, new_start, new_end); *start = new_start; *end = new_end; } /* * Normalization means making request better in terms of * size and alignment */ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; /* do normalize only data requests, metadata requests do not need preallocation */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; /* sometime caller may want exact blocks */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; /* caller may indicate that preallocation isn't * required (it's a tail, for example) */ if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) return; if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { ext4_mb_normalize_group_request(ac); return ; } bsbits = ac->ac_sb->s_blocksize_bits; /* first, let's learn actual file size * given current request is allocated */ size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); orig_size = size; /* max size of free chunks */ max = 2 << bsbits; #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? */ start_off = 0; if (size <= 16 * 1024) { size = 16 * 1024; } else if (size <= 32 * 1024) { size = 32 * 1024; } else if (size <= 64 * 1024) { size = 64 * 1024; } else if (size <= 128 * 1024) { size = 128 * 1024; } else if (size <= 256 * 1024) { size = 256 * 1024; } else if (size <= 512 * 1024) { size = 512 * 1024; } else if (size <= 1024 * 1024) { size = 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (21 - bsbits)) << 21; size = 2 * 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; size = (loff_t) EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; start = start_off >> bsbits; /* * For tiny groups (smaller than 8MB) the chosen allocation * alignment may be larger than group size. Make sure the * alignment does not move allocation to a different group which * makes mballoc fail assertions later. */ start = max(start, rounddown(ac->ac_o_ex.fe_logical, (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); /* avoid unnecessary preallocation that may trigger assertions */ if (start + size > EXT_MAX_BLOCKS) size = EXT_MAX_BLOCKS - start; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; start = ar->lleft + 1; } if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; /* * Trim allocation request for filesystems with artificially small * groups. */ if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); end = start + size; ext4_mb_pa_adjust_overlap(ac, &start, &end); size = end - start; /* * In this function "start" and "size" are normalized for better * alignment and length such that we could preallocate more blocks. * This normalization is done such that original request of * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and * "size" boundaries. * (Note fe_len can be relaxed since FS block allocation API does not * provide gurantee on number of contiguous blocks allocation since that * depends upon free space left, etc). * In case of inode pa, later we use the allocated blocks * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated * range of goal/best blocks [start, size] to put it at the * ac_o_ex.fe_logical extent of this inode. * (See ext4_mb_use_inode_pa() for more details) */ if (start + size <= ac->ac_o_ex.fe_logical || start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); BUG(); } BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size)) && ar->pright >= size && ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { /* merge to the right */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } if (ar->pleft && (ar->lleft + 1 == start) && ar->pleft + 1 < ext4_blocks_count(es)) { /* merge to the left */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, orig_size, start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); for (int i=0; i<EXT4_MB_NUM_CRS; i++) { atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]); } atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); /* did we allocate as much as normalizer originally wanted? */ if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len) atomic_inc(&sbi->s_bal_len_goals); if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) trace_ext4_mballoc_alloc(ac); else trace_ext4_mballoc_prealloc(ac); } /* * Called on failure; free up any blocks from the inode PA for this * context. We don't need this for MB_GROUP_PA because we only change * pa_free in ext4_mb_release_context(), but on failure, we've already * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. */ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; struct ext4_buddy e4b; int err; if (pa == NULL) { if (ac->ac_f_ex.fe_len == 0) return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); if (WARN_RATELIMIT(err, "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the * pages in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ return; ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); ext4_mb_unload_buddy(&e4b); return; } if (pa->pa_type == MB_INODE_PA) { spin_lock(&pa->pa_lock); pa->pa_free += ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); } } /* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); BUG_ON(ac->ac_b_ex.fe_len <= 0); pa->pa_free -= len; mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); } /* * use blocks preallocated to locality group */ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; /* we don't correct pa_pstart or pa_len here to avoid * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", pa->pa_lstart, len, pa); } /* * Return the prealloc space that have minimal distance * from the goal block. @cpa is the prealloc * space that is having currently known minimal distance * from the goal block. */ static struct ext4_prealloc_space * ext4_mb_check_group_pa(ext4_fsblk_t goal_block, struct ext4_prealloc_space *pa, struct ext4_prealloc_space *cpa) { ext4_fsblk_t cur_distance, new_distance; if (cpa == NULL) { atomic_inc(&pa->pa_count); return pa; } cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ atomic_dec(&cpa->pa_count); atomic_inc(&pa->pa_count); return pa; } /* * check if found pa meets EXT4_MB_HINT_GOAL_ONLY */ static bool ext4_mb_pa_goal_check(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))) return true; /* * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted * in ext4_mb_normalize_request and will keep same with ac_o_ex * from ext4_mb_initialize_context. Choose ac_g_ex here to keep * consistent with ext4_mb_find_by_goal. */ start = pa->pa_pstart + (ac->ac_g_ex.fe_logical - pa->pa_lstart); if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start) return false; if (ac->ac_g_ex.fe_len > pa->pa_len - EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart)) return false; return true; } /* * search goal blocks in preallocated space */ static noinline_for_stack bool ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; struct rb_node *iter; ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return false; /* * first, try per-file preallocation by searching the inode pa rbtree. * * Here, we can't do a direct traversal of the tree because * ext4_mb_discard_group_preallocation() can paralelly mark the pa * deleted and that can cause direct traversal to skip some entries. */ read_lock(&ei->i_prealloc_lock); if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) { goto try_group_pa; } /* * Step 1: Find a pa with logical start immediately adjacent to the * original logical start. This could be on the left or right. * * (tmp_pa->pa_lstart never changes so we can skip locking for it). */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa->pa_lstart, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); } /* * Step 2: The adjacent pa might be to the right of logical start, find * the left adjacent pa. After this step we'd have a valid tmp_pa whose * logical start is towards the left of original request's logical start */ if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) { struct rb_node *tmp; tmp = rb_prev(&tmp_pa->pa_node.inode_node); if (tmp) { tmp_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } else { /* * If there is no adjacent pa to the left then finding * an overlapping pa is not possible hence stop searching * inode pa tree */ goto try_group_pa; } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); /* * Step 3: If the left adjacent pa is deleted, keep moving left to find * the first non deleted adjacent pa. After this step we should have a * valid tmp_pa which is guaranteed to be non deleted. */ for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { /* * no non deleted left adjacent pa, so stop searching * inode pa tree */ goto try_group_pa; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { /* * We will keep holding the pa_lock from * this point on because we don't want group discard * to delete this pa underneath us. Since group * discard is anyways an ENOSPC operation it * should be okay for it to wait a few more cycles. */ break; } else { spin_unlock(&tmp_pa->pa_lock); } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); BUG_ON(tmp_pa->pa_deleted == 1); /* * Step 4: We now have the non deleted left adjacent pa. Only this * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. */ if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > EXT4_MAX_BLOCK_FILE_PHYS)) { /* * Since PAs don't overlap, we won't find any other PA to * satisfy this. */ spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); read_unlock(&ei->i_prealloc_lock); return true; } else { /* * We found a valid overlapping pa but couldn't use it because * it had no free blocks. This should ideally never happen * because: * * 1. When a new inode pa is added to rbtree it must have * pa_free > 0 since otherwise we won't actually need * preallocation. * * 2. An inode pa that is in the rbtree can only have it's * pa_free become zero when another thread calls: * ext4_mb_new_blocks * ext4_mb_use_preallocated * ext4_mb_use_inode_pa * * 3. Further, after the above calls make pa_free == 0, we will * immediately remove it from the rbtree in: * ext4_mb_new_blocks * ext4_mb_release_context * ext4_mb_put_pa * * 4. Since the pa_free becoming 0 and pa_free getting removed * from tree both happen in ext4_mb_new_blocks, which is always * called with i_data_sem held for data allocations, we can be * sure that another process will never see a pa in rbtree with * pa_free == 0. */ WARN_ON_ONCE(tmp_pa->pa_free == 0); } spin_unlock(&tmp_pa->pa_lock); try_group_pa: read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) return false; /* inode may have no locality group for some reason */ lg = ac->ac_lg; if (lg == NULL) return false; order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], pa_node.lg_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { cpa = ext4_mb_check_group_pa(goal_block, tmp_pa, cpa); } spin_unlock(&tmp_pa->pa_lock); } rcu_read_unlock(); } if (cpa) { ext4_mb_use_group_pa(ac, cpa); return true; } return false; } /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock held */ static noinline_for_stack void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_prealloc_space *pa; struct list_head *cur; ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; int len; if (!grp) return; /* all form of preallocation discards first load group, * so the only competing code is preallocation use. * we don't need any locking here * notice we do NOT ignore preallocations with pa_deleted * otherwise we could leave used blocks available for * allocation in buddy when concurrent ext4_mb_put_pa() * is dropping preallocation */ list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); len = pa->pa_len; spin_unlock(&pa->pa_lock); if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); mb_set_bits(bitmap, start, len); preallocated += len; } mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_mark_pa_deleted(struct super_block *sb, struct ext4_prealloc_space *pa) { struct ext4_inode_info *ei; if (pa->pa_deleted) { ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", pa->pa_type, pa->pa_pstart, pa->pa_lstart, pa->pa_len); return; } pa->pa_deleted = 1; if (pa->pa_type == MB_INODE_PA) { ei = EXT4_I(pa->pa_inode); atomic_dec(&ei->i_prealloc_active); } } static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) { BUG_ON(!pa); BUG_ON(atomic_read(&pa->pa_count)); BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); ext4_mb_pa_free(pa); } /* * drops a reference to preallocated space descriptor * if this was the last reference and the space is consumed */ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; ext4_fsblk_t grp_blk; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { spin_unlock(&pa->pa_lock); return; } if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; } ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ if (pa->pa_type == MB_GROUP_PA) grp_blk--; grp = ext4_get_group_number(sb, grp_blk); /* * possible race: * * P1 (buddy init) P2 (regular allocation) * find block B in PA * copy on-disk bitmap to buddy * mark B in on-disk bitmap * drop PA from group * mark all PAs in buddy * * thus, P1 initializes buddy with B available. to prevent this * we make "copy" and "mark all PAs" atomic and serialize "drop PA" * against that pair */ ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); if (pa->pa_type == MB_INODE_PA) { write_lock(pa->pa_node_lock.inode_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); ext4_mb_pa_free(pa); } else { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) { struct rb_node **iter = &root->rb_node, *parent = NULL; struct ext4_prealloc_space *iter_pa, *new_pa; ext4_lblk_t iter_start, new_start; while (*iter) { iter_pa = rb_entry(*iter, struct ext4_prealloc_space, pa_node.inode_node); new_pa = rb_entry(new, struct ext4_prealloc_space, pa_node.inode_node); iter_start = iter_pa->pa_lstart; new_start = new_pa->pa_lstart; parent = *iter; if (new_start < iter_start) iter = &((*iter)->rb_left); else iter = &((*iter)->rb_right); } rb_link_node(new, parent, iter); rb_insert_color(new, root); } /* * creates new preallocated space for given inode */ static noinline_for_stack void ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { struct ext4_free_extent ex = { .fe_logical = ac->ac_g_ex.fe_logical, .fe_len = ac->ac_orig_goal_len, }; loff_t orig_goal_end = extent_logical_end(sbi, &ex); loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex); /* * We can't allocate as much as normalizer wants, so we try * to get proper lstart to cover the original request, except * when the goal doesn't cover the original request as below: * * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048 * best_ex:0/200(200) -> adjusted: 1848/2048(200) */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); /* * Use the below logic for adjusting best extent as it keeps * fragmentation in check while ensuring logical range of best * extent doesn't overflow out of goal extent: * * 1. Check if best ex can be kept at end of goal (before * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and * still cover original end * 3. Else, keep the best ex at start of original request. */ ex.fe_len = ac->ac_b_ex.fe_len; ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; ex.fe_logical = ac->ac_g_ex.fe_logical; if (o_ex_end <= extent_logical_end(sbi, &ex)) goto adjust_bex; ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ext4_mb_use_inode_pa(ac, pa); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); write_lock(pa->pa_node_lock.inode_lock); ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); write_unlock(pa->pa_node_lock.inode_lock); atomic_inc(&ei->i_prealloc_active); } /* * creates new preallocated space for locality group inodes belongs to */ static noinline_for_stack void ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; struct ext4_prealloc_space *pa; struct ext4_group_info *grp; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_node.lg_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; lg = ac->ac_lg; BUG_ON(lg == NULL); pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); /* * We will later add the new pa to the right bucket * after updating the pa_free in ext4_mb_release_context */ } static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) ext4_mb_new_group_pa(ac); else ext4_mb_new_inode_pa(ac); } /* * finds all unused blocks in on-disk bitmap, frees them in * in-core bitmap and buddy. * @pa must be unlinked from inode and group lists, so that * nobody else can find/use it. * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int end; unsigned int next; ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); mb_debug(sb, "free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { ext4_msg(e4b->bd_sb, KERN_CRIT, "pa %p: logic %lu, phys. %lu, len %d", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. */ } atomic_add(free, &sbi->s_mb_discarded); } static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); } /* * releases all preallocations in given group * * first, we need to decide discard policy: * - when do we discard * 1) ENOSPC * - how many do we discard * 1) how many requested */ static noinline_for_stack int ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int *busy) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; int free = 0; if (!grp) return 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) goto out_dbg; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); goto out_dbg; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); goto out_dbg; } ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { spin_unlock(&pa->pa_lock); *busy = 1; continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* seems this one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); if (!free) this_cpu_inc(discard_pa_seq); /* we can trust pa_free ... */ free += pa->pa_free; spin_unlock(&pa->pa_lock); list_del(&pa->pa_group_list); list_add(&pa->u.pa_tmp_list, &list); } /* now free all selected PAs */ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ if (pa->pa_type == MB_GROUP_PA) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); } else { write_lock(pa->pa_node_lock.inode_lock); ei = EXT4_I(pa->pa_inode); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); } list_del(&pa->u.pa_tmp_list); if (pa->pa_type == MB_GROUP_PA) { ext4_mb_release_group_pa(&e4b, pa); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } else { ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_mb_pa_free(pa); } } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", free, group, grp->bb_free); return free; } /* * releases all non-used preallocated blocks for given inode * * It's important to discard preallocations under i_data_sem * We don't want another block to be served from the prealloc * space when we are discarding the inode prealloc space. * * FIXME!! Make sure it is valid at all the call sites */ void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; if (!S_ISREG(inode->i_mode)) return; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; } if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); continue; } /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from * the list. as we might be called from * ->clear_inode() the inode will get freed * and concurrent thread which is unlinking * pa from inode's list may access already * freed memory, bad-bad-bad */ /* XXX: if this happens too often, we can * add a flag to force wait only in case * of ->clear_inode(), but not in case of * regular truncate */ schedule_timeout_uninterruptible(HZ); goto repeat; } write_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); ext4_mb_pa_free(pa); } } static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa; BUG_ON(ext4_pspace_cachep == NULL); pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); if (!pa) return -ENOMEM; atomic_set(&pa->pa_count, 1); ac->ac_pa = pa; return 0; } static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; BUG_ON(!pa); ac->ac_pa = NULL; WARN_ON(!atomic_dec_and_test(&pa->pa_count)); /* * current function is only called due to an error or due to * len of found blocks < len of requested blocks hence the PA has not * been added to grp->bb_prealloc_list. So we don't need to lock it */ pa->pa_deleted = 1; ext4_mb_pa_free(pa); } #ifdef CONFIG_EXT4_DEBUG static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); mb_debug(sb, "groups: "); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; struct list_head *cur; if (!grp) continue; ext4_lock_group(sb, i); list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); mb_debug(sb, "PA:%u:%d:%d\n", i, start, pa->pa_len); } ext4_unlock_group(sb, i); mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, grp->bb_fragments); } } static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" " Allocation context details:"); mb_debug(sb, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); mb_debug(sb, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, (unsigned long)ac->ac_o_ex.fe_logical, (unsigned long)ac->ac_g_ex.fe_group, (unsigned long)ac->ac_g_ex.fe_start, (unsigned long)ac->ac_g_ex.fe_len, (unsigned long)ac->ac_g_ex.fe_logical, (unsigned long)ac->ac_b_ex.fe_group, (unsigned long)ac->ac_b_ex.fe_start, (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? "group pa" : "inode pa"); ext4_mb_show_pa(sb); } #else static inline void ext4_mb_show_pa(struct super_block *sb) { } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); } #endif /* * We use locality group preallocation for small size file. The size of the * file is determined by the current size or the resulting size after * allocation which ever is larger * * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; bool inode_pa_eligible, group_pa_eligible; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; /* No point in using inode preallocation for closed files */ if ((size == isize) && !ext4_fs_is_busy(sbi) && !inode_is_open_for_write(ac->ac_inode)) inode_pa_eligible = false; size = max(size, isize); /* Don't use group allocation for large files */ if (size > sbi->s_mb_stream_request) group_pa_eligible = false; if (!group_pa_eligible) { if (inode_pa_eligible) ac->ac_flags |= EXT4_MB_STREAM_ALLOC; else ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } BUG_ON(ac->ac_lg != NULL); /* * locality group prealloc space are per cpu. The reason for having * per cpu locality group is to reduce the contention between block * request from multiple CPUs. */ ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; /* serialize all allocations in the group */ mutex_lock(&ac->ac_lg->lg_mutex); } static noinline_for_stack void ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; unsigned int len; ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ len = ar->len; /* just a dirty hack to filter too big requests */ if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; ac->ac_flags = ar->flags; /* we have to define context: we'll work with a file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, inode_is_open_for_write(ar->inode) ? "" : "non-"); } static noinline_for_stack void ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_locality_group *lg, int order, int total_entries) { ext4_group_t group = 0; struct ext4_buddy e4b; LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* * This is the pa that we just used * for block allocation. So don't * free that */ spin_unlock(&pa->pa_lock); continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* only lg prealloc space */ BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_node.lg_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; if (total_entries <= 5) { /* * we want to keep only 5 entries * allowing it to grow to 8. This * mak sure we don't call discard * soon for this list. */ break; } } spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { int err; group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } /* * We have incremented pa_count. So it cannot be freed at this * point. Also we hold lg_mutex. So no parallel allocation is * possible from this lg. That means pa_free cannot be updated. * * A parallel ext4_mb_discard_group_preallocations is possible. * which can cause the lg_prealloc_list to be updated. */ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) { int order, added = 0, lg_prealloc_count = 1; struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; order = fls(pa->pa_free) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { spin_unlock(&tmp_pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ list_add_tail_rcu(&pa->pa_node.lg_list, &tmp_pa->pa_node.lg_list); added = 1; /* * we want to count the total * number of entries in the list */ } spin_unlock(&tmp_pa->pa_lock); lg_prealloc_count++; } if (!added) list_add_tail_rcu(&pa->pa_node.lg_list, &lg->lg_prealloc_list[order]); spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); } /* * release all resource we used in allocation */ static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding * doesn't grow big. */ if (likely(pa->pa_free)) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); ext4_mb_add_n_trim(ac); } } ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_folio) folio_put(ac->ac_bitmap_folio); if (ac->ac_buddy_folio) folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0, busy = 0; int retry = 0; trace_ext4_mb_discard_preallocations(sb, needed); if (needed == 0) needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; repeat: for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, &busy); freed += ret; needed -= ret; cond_resched(); } if (needed > 0 && busy && ++retry < 3) { busy = 0; goto repeat; } return freed; } static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, struct ext4_allocation_context *ac, u64 *seq) { int freed; u64 seq_retry = 0; bool ret = false; freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) { ret = true; goto out_dbg; } seq_retry = ext4_get_discard_pa_seq_sum(); if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { ac->ac_flags |= EXT4_MB_STRICT_CHECK; *seq = seq_retry; ret = true; } out_dbg: mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } /* * Simple allocator for Ext4 fast commit replay path. It searches for blocks * linearly starting at the goal block and also excludes the blocks which * are going to be in use after fast commit replay. */ static ext4_fsblk_t ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) { struct buffer_head *bitmap_bh; struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group, nr; ext4_grpblk_t blkoff; ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ar->len = 0; ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { *errp = PTR_ERR(bitmap_bh); pr_warn("Failed to read block bitmap\n"); return 0; } while (1) { i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); if (i >= max) break; if (ext4_fc_replay_check_excluded(sb, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i))) { blkoff = i + 1; } else break; } brelse(bitmap_bh); if (i < max) break; if (++group >= ext4_get_groups_count(sb)) group = 0; blkoff = 0; } if (i >= max) { *errp = -ENOSPC; return 0; } block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; *errp = 0; return block; } /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; int retries = 0; u64 seq; might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); trace_ext4_request_blocks(ar); if (sbi->s_mount_state & EXT4_FC_REPLAY) return ext4_mb_new_blocks_simple(ar, errp); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. */ while (ar->len && ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { ext4_mb_show_pa(sb); *errp = -ENOSPC; return 0; } reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { dquot_alloc_block_nofail(ar->inode, EXT4_C2B(sbi, ar->len)); } else { while (ar->len && dquot_alloc_block(ar->inode, EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } } inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; goto out; } } ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; goto out; } ext4_mb_initialize_context(ac, ar); ac->ac_op = EXT4_MB_HISTORY_PREALLOC; seq = this_cpu_read(discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); *errp = ext4_mb_pa_alloc(ac); if (*errp) goto errout; repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); /* * pa allocated above is added to grp->bb_prealloc_list only * when we were able to allocate some block i.e. when * ac->ac_status == AC_STATUS_FOUND. * And error from above mean ac->ac_status != AC_STATUS_FOUND * So we have to free this pa here itself. */ if (*errp) { ext4_mb_pa_put_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } if (ac->ac_status == AC_STATUS_FOUND && ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) ext4_mb_pa_put_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } } else { if (++retries < 3 && ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above * needs to be freed here itself. */ ext4_mb_pa_put_free(ac); *errp = -ENOSPC; } if (*errp) { errout: ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } ext4_mb_release_context(ac); kmem_cache_free(ext4_ac_cachep, ac); out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); return block; } /* * We can merge two free data extents only if the physical blocks * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. */ static inline bool ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { if (entry1->efd_tid != entry2->efd_tid) return false; if (entry1->efd_start_cluster + entry1->efd_count != entry2->efd_start_cluster) return false; if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group)) return false; return true; } static inline void ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root, struct ext4_free_data *entry1, struct ext4_free_data *entry2) { entry1->efd_count += entry2->efd_count; spin_lock(&sbi->s_md_lock); list_del(&entry2->efd_list); spin_unlock(&sbi->s_md_lock); rb_erase(&entry2->efd_node, root); kmem_cache_free(ext4_free_data_cachep, entry2); } static inline void ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root, struct ext4_free_data *entry) { struct ext4_free_data *prev; struct rb_node *node; node = rb_prev(&entry->efd_node); if (!node) return; prev = rb_entry(node, struct ext4_free_data, efd_node); if (ext4_freed_extents_can_be_merged(prev, entry)) ext4_merge_freed_extents(sbi, root, prev, entry); } static inline void ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root, struct ext4_free_data *entry) { struct ext4_free_data *next; struct rb_node *node; node = rb_next(&entry->efd_node); if (!node) return; next = rb_entry(node, struct ext4_free_data, efd_node); if (ext4_freed_extents_can_be_merged(entry, next)) ext4_merge_freed_extents(sbi, root, entry, next); } static noinline_for_stack void ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; ext4_grpblk_t clusters = new_entry->efd_count; struct ext4_free_data *entry = NULL; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct rb_root *root = &db->bb_free_root; struct rb_node **n = &root->rb_node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_folio == NULL); BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. We need to protect buddy cache from being freed, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ folio_get(e4b->bd_buddy_folio); folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, efd_node); if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); kmem_cache_free(ext4_free_data_cachep, new_entry); return; } } atomic_add(clusters, &sbi->s_mb_free_pending); if (!entry) goto insert; /* Now try to see the extent can be merged to prev and next */ if (ext4_freed_extents_can_be_merged(new_entry, entry)) { entry->efd_start_cluster = cluster; entry->efd_count += new_entry->efd_count; kmem_cache_free(ext4_free_data_cachep, new_entry); ext4_try_merge_freed_extent_prev(sbi, root, entry); return; } if (ext4_freed_extents_can_be_merged(entry, new_entry)) { entry->efd_count += new_entry->efd_count; kmem_cache_free(ext4_free_data_cachep, new_entry); ext4_try_merge_freed_extent_next(sbi, root, entry); return; } insert: rb_link_node(new_node, parent, n); rb_insert_color(new_node, root); spin_lock(&sbi->s_md_lock); list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); spin_unlock(&sbi->s_md_lock); } static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { struct super_block *sb = inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); } /** * ext4_mb_clear_bb() -- helper function for freeing blocks. * Used by ext4_free_blocks() * @handle: handle for this transaction * @inode: inode * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; int mark_flags = 0; ext4_grpblk_t changed; sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_out; } flags |= EXT4_FREE_BLOCKS_VALIDATED; do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); grp = ext4_get_group_info(sb, block_group); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return; /* * Check to see if we are freeing blocks across a group * boundary. */ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { overflow = EXT4_C2B(sbi, bit) + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) goto error_out; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_clean; } #ifdef AGGRESSIVE_CHECK mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, false, block_group, bit, count_clusters, mark_flags, &changed); if (err && changed == 0) goto error_clean; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != count_clusters); #endif /* * We need to make sure we don't reuse the freed block until after the * transaction is committed. We make an exception if the inode is to be * written in writeback mode since writeback mode has weak data * consistency guarantees. */ if (ext4_handle_valid(handle) && ((flags & EXT4_FREE_BLOCKS_METADATA) || !ext4_should_writeback_data(inode))) { struct ext4_free_data *new_entry; /* * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. */ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, count_clusters); /* * Ignore EOPNOTSUPP error. This is consistent with * what happens when using journal. */ if (err == -EOPNOTSUPP) err = 0; if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, err); } EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_free_blocks(inode, &e4b, bit, count_clusters); } ext4_unlock_group(sb, block_group); /* * on a bigalloc file system, defer the s_freeclusters_counter * update to the caller (ext4_remove_space and friends) so they * can determine if a cluster freed here should be rereserved */ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); } if (overflow && !err) { block += count; count = overflow; ext4_mb_unload_buddy(&e4b); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); } /** * ext4_free_blocks() -- Free given blocks and update quota * @handle: handle for this transaction * @inode: inode * @bh: optional buffer of the block to be freed * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; unsigned int overflow; struct ext4_sb_info *sbi; sbi = EXT4_SB(sb); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); else block = bh->b_blocknr; } if (sbi->s_mount_state & EXT4_FC_REPLAY) { ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); return; } might_sleep(); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); return; } flags |= EXT4_FREE_BLOCKS_VALIDATED; ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, flags); if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { BUG_ON(count > 1); ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, bh, block); } /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; block += overflow; if (count > overflow) count -= overflow; else return; } else { block -= overflow; count += overflow; } /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) count -= overflow; else return; } else count += sbi->s_cluster_ratio - overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) bh = sb_find_get_block_nonatomic(inode->i_sb, block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } ext4_mb_clear_bb(handle, inode, block, count, flags); } /** * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physical block to add to the block group * @count: number of blocks to free * * This marks the blocks as free in the bitmap and buddy. */ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { ext4_group_t block_group; ext4_grpblk_t bit; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; int err = 0; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; ext4_grpblk_t changed; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); if (cluster_count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); /* * Check to see if we are freeing blocks across a group * boundary. */ if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; goto error_out; } err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_out; if (!ext4_sb_block_valid(sb, NULL, block, count)) { ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); err = -EINVAL; goto error_clean; } err = ext4_mb_mark_context(handle, sb, false, block_group, bit, cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, &changed); if (err && changed == 0) goto error_clean; if (changed != cluster_count) ext4_error(sb, "bit already cleared in group %u", block_group); ext4_lock_group(sb, block_group); mb_free_blocks(NULL, &e4b, bit, cluster_count); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, changed); error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); return err; } /** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To assure that no * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ static int ext4_trim_extent(struct super_block *sb, int start, int count, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; ext4_group_t group = e4b->bd_group; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; ex.fe_group = group; ex.fe_len = count; /* * Mark blocks used, so no one can reuse them while * being trimmed. */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; } static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, ext4_group_t grp) { unsigned long nr_clusters_in_group; if (grp < (ext4_get_groups_count(sb) - 1)) nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb); else nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, grp)) >> EXT4_CLUSTER_BITS(sb); return nr_clusters_in_group - 1; } static bool ext4_trim_interrupted(void) { return fatal_signal_pending(current) || freezing(current); } static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count, last, origin_start; bool set_trimmed = false; void *bitmap; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return 0; last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; if (start == 0 && max >= last) set_trimmed = true; origin_start = start; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; while (start <= max) { start = mb_find_next_zero_bit(bitmap, max + 1, start); if (start > max) break; next = mb_find_next_bit(bitmap, last + 1, start); if (origin_start == 0 && next >= last) set_trimmed = true; if ((next - start) >= minblocks) { int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) return count; count += next - start; } free_count += next - start; start = next + 1; if (ext4_trim_interrupted()) return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); cond_resched(); ext4_lock_group(sb, e4b->bd_group); } if ((e4b->bd_info->bb_free - free_count) < minblocks) break; } if (set_trimmed) EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); return count; } /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy * bitmap. Then issue a TRIM command on this extent and free the extent in * the group buddy bitmap. */ static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; trace_ext4_trim_all_free(sb, group, start, max); ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_warning(sb, "Error %d loading buddy information for %u", ret, group); return ret; } ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); else ret = 0; ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", ret, group); return ret; } /** * ext4_trim_fs() -- trim ioctl handle function * @sb: superblock for filesystem * @range: fstrim_range structure * * start: First Byte to trim * len: number of Bytes to trim from start * minlen: minimum extent length in Bytes * ext4_trim_fs goes through all allocation groups containing Bytes from * start to start+len. For each such a group ext4_trim_all_free function * is invoked to trim all free space. */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = EXT4_NUM_B2C(EXT4_SB(sb), range->minlen >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; /* No point to try to trim less than discard granularity */ if (range->minlen < discard_granularity) { minlen = EXT4_NUM_B2C(EXT4_SB(sb), discard_granularity >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } if (end >= max_blks - 1) end = max_blks - 1; if (end <= first_data_blk) goto out; if (start < first_data_blk) start = first_data_blk; /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { if (ext4_trim_interrupted()) break; grp = ext4_get_group_info(sb, group); if (!grp) continue; /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } /* * For all the groups except the last one, last cluster will * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ if (group == last_group) end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, end, minlen); if (cnt < 0) { ret = cnt; break; } trimmed += cnt; } /* * For every group except the first one, we are sure * that the first cluster to discard will be cluster #0. */ first_cluster = 0; } if (!ret) EXT4_SB(sb)->s_last_trim_minblks = minlen; out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } /* Iterate all the free extents in the group. */ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, ext4_grpblk_t first, ext4_grpblk_t end, ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; error = ext4_mb_load_buddy(sb, group, &e4b); if (error) return error; bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; if (meta_formatter && start != first) { if (start > end) start = end; ext4_unlock_group(sb, group); error = meta_formatter(sb, group, first, start - first, priv); if (error) goto out_unload; ext4_lock_group(sb, group); } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) break; next = mb_find_next_bit(bitmap, end + 1, start); ext4_unlock_group(sb, group); error = formatter(sb, group, start, next - start, priv); if (error) goto out_unload; ext4_lock_group(sb, group); start = next + 1; } ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(&e4b); return error; } #ifdef CONFIG_EXT4_KUNIT_TESTS #include "mballoc-test.c" #endif
4849 99 4910 4 4017 4249 849 2 4845 170 1 848 614 6 4850 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H #include <linux/blk-mq.h> #include "blk-stat.h" struct blk_mq_tag_set; struct blk_mq_ctxs { struct kobject kobj; struct blk_mq_ctx __percpu *queue_ctx; }; /** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; struct list_head rq_lists[HCTX_MAX_TYPES]; } ____cacheline_aligned_in_smp; unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; #define BLK_MQ_CPU_WORK_BATCH (8) typedef unsigned int __bitwise blk_insert_t; #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); /* * CPU -> queue mappings */ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); /* * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue * @q: request queue * @type: the hctx type index * @cpu: CPU */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, enum hctx_type type, unsigned int cpu) { return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); int blk_mq_sysfs_register(struct gendisk *disk); void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { return per_cpu_ptr(q->queue_ctx, cpu); } /* * This assumes per-cpu software queueing queues. They could be per-node * as well, for instance. For now this is hardcoded as-is. Note that we don't * care about preemption, since we know the ctx's are persistent. This does * mean that we can't rely on ctx always matching the currently running CPU. */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; blk_opf_t cmd_flags; req_flags_t rq_flags; /* allocate multiple requests/tags in one go */ unsigned int nr_tags; struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset); void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->rq_flags & RQF_SCHED_TAGS) return data->hctx->sched_tags; return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { /* Fast path: hardware queue is not stopped most of the time. */ if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return false; /* * This barrier is used to order adding of dispatch list before and * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier * in blk_mq_start_stopped_hw_queue() so that dispatch code could * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not * empty to avoid missing dispatching requests. */ smp_mb(); return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) { if (q->mq_ops->put_budget) q->mq_ops->put_budget(q, budget_token); } static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q); return 0; } static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) { if (token < 0) return; if (rq->q->mq_ops->set_rq_budget_token) rq->q->mq_ops->set_rq_budget_token(rq, token); } static inline int blk_mq_get_rq_budget_token(struct request *rq) { if (rq->q->mq_ops->get_rq_budget_token) return rq->q->mq_ops->get_rq_budget_token(rq); return -1; } static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_add(val, &hctx->nr_active); } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_sub(val, &hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_sub_active_requests(hctx, 1); } static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_add_active_requests(hctx, val); } static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_inc_active_requests(hctx); } static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_sub_active_requests(hctx, val); } static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_dec_active_requests(hctx); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; } static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) return; __blk_mq_put_driver_tag(rq->mq_hctx, rq); } bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) qmap->mq_map[cpu] = 0; } /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { while (!list_empty(list)) { struct request *rq = list_entry_rq(list->next); list_del_init(&rq->queuelist); blk_mq_free_request(rq); } } /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { unsigned int depth, users; if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return true; /* * Don't try dividing an ant */ if (bt->sb.depth == 1) return true; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; } users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; /* * Allow at least some tags */ depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth; } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock(__tag_set->srcu); \ (dispatch_ops); \ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ } else { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ } \ } while (0) #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ static inline bool blk_mq_can_poll(struct request_queue *q) { return (q->limits.features & BLK_FEAT_POLL) && q->tag_set->map[HCTX_TYPE_POLL].nr_queues; } #endif
5 1 1 1 1 2 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/tcp.h> #include <net/ip.h> #include <net/tcp.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netfilter/nfnetlink_osf.h> /* * Indexed by dont-fragment bit. * It is the only constant value in the fingerprint. */ struct list_head nf_osf_fingers[2]; EXPORT_SYMBOL_GPL(nf_osf_fingers); static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); const struct in_ifaddr *ifa; int ret = 0; if (ttl_check == NF_OSF_TTL_TRUE) return ip->ttl == f_ttl; if (ttl_check == NF_OSF_TTL_NOCHECK) return 1; else if (ip->ttl <= f_ttl) return 1; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } return ret; } struct nf_osf_hdr_ctx { bool df; u16 window; u16 totlen; const unsigned char *optp; unsigned int optsize; }; static bool nf_osf_match_one(const struct sk_buff *skb, const struct nf_osf_user_finger *f, int ttl_check, struct nf_osf_hdr_ctx *ctx) { const __u8 *optpinit = ctx->optp; unsigned int check_WSS = 0; int fmatch = FMATCH_WRONG; int foptsize, optnum; u16 mss = 0; if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) return false; /* * Should not happen if userspace parser was written correctly. */ if (f->wss.wc >= OSF_WSS_MAX) return false; /* Check options */ foptsize = 0; for (optnum = 0; optnum < f->opt_num; ++optnum) foptsize += f->opt[optnum].length; if (foptsize > MAX_IPOPTLEN || ctx->optsize > MAX_IPOPTLEN || ctx->optsize != foptsize) return false; check_WSS = f->wss.wc; for (optnum = 0; optnum < f->opt_num; ++optnum) { if (f->opt[optnum].kind == *ctx->optp) { __u32 len = f->opt[optnum].length; const __u8 *optend = ctx->optp + len; fmatch = FMATCH_OK; switch (*ctx->optp) { case OSFOPT_MSS: mss = ctx->optp[3]; mss <<= 8; mss |= ctx->optp[2]; mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: break; } ctx->optp = optend; } else fmatch = FMATCH_OPT_WRONG; if (fmatch != FMATCH_OK) break; } if (fmatch != FMATCH_OPT_WRONG) { fmatch = FMATCH_WRONG; switch (check_WSS) { case OSF_WSS_PLAIN: if (f->wss.val == 0 || ctx->window == f->wss.val) fmatch = FMATCH_OK; break; case OSF_WSS_MSS: /* * Some smart modems decrease mangle MSS to * SMART_MSS_2, so we check standard, decreased * and the one provided in the fingerprint MSS * values. */ #define SMART_MSS_1 1460 #define SMART_MSS_2 1448 if (ctx->window == f->wss.val * mss || ctx->window == f->wss.val * SMART_MSS_1 || ctx->window == f->wss.val * SMART_MSS_2) fmatch = FMATCH_OK; break; case OSF_WSS_MTU: if (ctx->window == f->wss.val * (mss + 40) || ctx->window == f->wss.val * (SMART_MSS_1 + 40) || ctx->window == f->wss.val * (SMART_MSS_2 + 40)) fmatch = FMATCH_OK; break; case OSF_WSS_MODULO: if ((ctx->window % f->wss.val) == 0) fmatch = FMATCH_OK; break; } } if (fmatch != FMATCH_OK) ctx->optp = optpinit; return fmatch == FMATCH_OK; } static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, const struct sk_buff *skb, const struct iphdr *ip, unsigned char *opts, struct tcphdr *_tcph) { const struct tcphdr *tcp; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph); if (!tcp) return NULL; if (!tcp->syn) return NULL; ctx->totlen = ntohs(ip->tot_len); ctx->df = ntohs(ip->frag_off) & IP_DF; ctx->window = ntohs(tcp->window); if (tcp->doff * 4 > sizeof(struct tcphdr)) { ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + sizeof(struct tcphdr), ctx->optsize, opts); if (!ctx->optp) return NULL; } return tcp; } bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; int fcount = 0, ttl_check; int fmatch = FMATCH_WRONG; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : 0; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) continue; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; fmatch = FMATCH_OK; fcount++; if (info->flags & NF_OSF_LOG) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest), f->ttl - ip->ttl); if ((info->flags & NF_OSF_LOG) && info->loglevel == NF_OSF_LOGLEVEL_FIRST) break; } if (!fcount && (info->flags & NF_OSF_LOG)) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); if (fcount) fmatch = FMATCH_OK; return fmatch == FMATCH_OK; } EXPORT_SYMBOL_GPL(nf_osf_match); bool nf_osf_find(const struct sk_buff *skb, const struct list_head *nf_osf_fingers, const int ttl_check, struct nf_osf_data *data) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; bool found = false; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; data->genre = f->genre; data->version = f->version; found = true; break; } return found; } EXPORT_SYMBOL_GPL(nf_osf_find); static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, }; static int nfnl_osf_add_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; int err = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); if (f->opt_num > ARRAY_SIZE(f->opt)) return -EINVAL; if (!memchr(f->genre, 0, MAXGENRELEN) || !memchr(f->subtype, 0, MAXGENRELEN) || !memchr(f->version, 0, MAXGENRELEN)) return -EINVAL; kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger)); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; kfree(kf); kf = NULL; if (info->nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } /* * We are protected by nfnl mutex. */ if (kf) list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]); return err; } static int nfnl_osf_remove_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *sf; int err = -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; /* * We are protected by nfnl mutex. */ list_del_rcu(&sf->finger_entry); kfree_rcu(sf, rcu_head); err = 0; break; } return err; } static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = nfnl_osf_add_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, [OSF_MSG_REMOVE] = { .call = nfnl_osf_remove_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, }; static const struct nfnetlink_subsystem nfnl_osf_subsys = { .name = "osf", .subsys_id = NFNL_SUBSYS_OSF, .cb_count = OSF_MSG_MAX, .cb = nfnl_osf_callbacks, }; static int __init nfnl_osf_init(void) { int err = -EINVAL; int i; for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) INIT_LIST_HEAD(&nf_osf_fingers[i]); err = nfnetlink_subsys_register(&nfnl_osf_subsys); if (err < 0) { pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); goto err_out_exit; } return 0; err_out_exit: return err; } static void __exit nfnl_osf_fini(void) { struct nf_osf_finger *f; int i; nfnetlink_subsys_unregister(&nfnl_osf_subsys); rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) { list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) { list_del_rcu(&f->finger_entry); kfree_rcu(f, rcu_head); } } rcu_read_unlock(); rcu_barrier(); } module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Passive OS fingerprint matching"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
514 1016 1014 1497 515 1 1360 1498 1499 25 25 3 514 515 514 92 92 92 92 92 91 3 3 3 3 3 92 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/init.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/tty.h> #include "internal.h" static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); #define CHRDEV_MAJOR_HASH_SIZE 255 static struct char_device_struct { struct char_device_struct *next; unsigned int major; unsigned int baseminor; int minorct; char name[64]; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd, *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); mutex_lock(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); goto out; } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; goto out; } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } mutex_unlock(&chrdevs_lock); return cd; out: mutex_unlock(&chrdevs_lock); kfree(cd); return ERR_PTR(ret); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. */ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * * If @major == 0 this functions will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a -ve errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... */ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) { error = -EBUSY; goto err; } error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) goto err; kobject_get(p->kobj.parent); return 0; err: kfree_const(p->kobj.name); p->kobj.name = NULL; return error; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and it's corresponding * struct device, linkink * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add does. It then adds @dev to the system using device_add * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add. * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @cdev: the cdev structure * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. * * NOTE: This guarantees that cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev);
2 2 1 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2017 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/openvswitch.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <net/netlink.h> #include <net/genetlink.h> #include "datapath.h" #include "meter.h" static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, }; static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; static u32 meter_hash(struct dp_meter_instance *ti, u32 id) { return id % ti->n_meters; } static void ovs_meter_free(struct dp_meter *meter) { if (!meter) return; kfree_rcu(meter, rcu); } /* Call with ovs_mutex or RCU read lock. */ static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, u32 meter_id) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 hash = meter_hash(ti, meter_id); struct dp_meter *meter; meter = rcu_dereference_ovsl(ti->dp_meters[hash]); if (meter && likely(meter->id == meter_id)) return meter; return NULL; } static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) { struct dp_meter_instance *ti; ti = kvzalloc(struct_size(ti, dp_meters, size), GFP_KERNEL); if (!ti) return NULL; ti->n_meters = size; return ti; } static void dp_meter_instance_free(struct dp_meter_instance *ti) { kvfree(ti); } static void dp_meter_instance_free_rcu(struct rcu_head *rcu) { struct dp_meter_instance *ti; ti = container_of(rcu, struct dp_meter_instance, rcu); kvfree(ti); } static int dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); int n_meters = min(size, ti->n_meters); struct dp_meter_instance *new_ti; int i; new_ti = dp_meter_instance_alloc(size); if (!new_ti) return -ENOMEM; for (i = 0; i < n_meters; i++) if (rcu_dereference_ovsl(ti->dp_meters[i])) new_ti->dp_meters[i] = ti->dp_meters[i]; rcu_assign_pointer(tbl->ti, new_ti); call_rcu(&ti->rcu, dp_meter_instance_free_rcu); return 0; } static void dp_meter_instance_insert(struct dp_meter_instance *ti, struct dp_meter *meter) { u32 hash; hash = meter_hash(ti, meter->id); rcu_assign_pointer(ti->dp_meters[hash], meter); } static void dp_meter_instance_remove(struct dp_meter_instance *ti, struct dp_meter *meter) { u32 hash; hash = meter_hash(ti, meter->id); RCU_INIT_POINTER(ti->dp_meters[hash], NULL); } static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 hash = meter_hash(ti, meter->id); int err; /* In generally, slots selected should be empty, because * OvS uses id-pool to fetch a available id. */ if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) return -EBUSY; dp_meter_instance_insert(ti, meter); /* That function is thread-safe. */ tbl->count++; if (tbl->count >= tbl->max_meters_allowed) { err = -EFBIG; goto attach_err; } if (tbl->count >= ti->n_meters && dp_meter_instance_realloc(tbl, ti->n_meters * 2)) { err = -ENOMEM; goto attach_err; } return 0; attach_err: dp_meter_instance_remove(ti, meter); tbl->count--; return err; } static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { struct dp_meter_instance *ti; ASSERT_OVSL(); if (!meter) return 0; ti = rcu_dereference_ovsl(tbl->ti); dp_meter_instance_remove(ti, meter); tbl->count--; /* Shrink the meter array if necessary. */ if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && tbl->count <= (ti->n_meters / 4)) { int half_size = ti->n_meters / 2; int i; /* Avoid hash collision, don't move slots to other place. * Make sure there are no references of meters in array * which will be released. */ for (i = half_size; i < ti->n_meters; i++) if (rcu_dereference_ovsl(ti->dp_meters[i])) goto out; if (dp_meter_instance_realloc(tbl, half_size)) goto shrink_err; } out: return 0; shrink_err: dp_meter_instance_insert(ti, meter); tbl->count++; return -ENOMEM; } static struct sk_buff * ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, struct ovs_header **ovs_reply_header) { struct sk_buff *skb; struct ovs_header *ovs_header = genl_info_userhdr(info); skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return ERR_PTR(-ENOMEM); *ovs_reply_header = genlmsg_put(skb, info->snd_portid, info->snd_seq, &dp_meter_genl_family, 0, cmd); if (!*ovs_reply_header) { nlmsg_free(skb); return ERR_PTR(-EMSGSIZE); } (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; return skb; } static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, struct dp_meter *meter) { struct nlattr *nla; struct dp_meter_band *band; u16 i; if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) goto error; if (nla_put(reply, OVS_METER_ATTR_STATS, sizeof(struct ovs_flow_stats), &meter->stats)) goto error; if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, OVS_METER_ATTR_PAD)) goto error; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); if (!nla) goto error; band = meter->bands; for (i = 0; i < meter->n_bands; ++i, ++band) { struct nlattr *band_nla; band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, sizeof(struct ovs_flow_stats), &band->stats)) goto error; nla_nest_end(reply, band_nla); } nla_nest_end(reply, nla); return 0; error: return -EMSGSIZE; } static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; struct sk_buff *reply; struct datapath *dp; int err = -EMSGSIZE; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, dp->meter_tbl.max_meters_allowed)) goto exit_unlock; ovs_unlock(); if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) goto nla_put_failure; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); if (!nla) goto nla_put_failure; band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); if (!band_nla) goto nla_put_failure; /* Currently only DROP band type is supported. */ if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP)) goto nla_put_failure; nla_nest_end(reply, band_nla); nla_nest_end(reply, nla); genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_unlock: ovs_unlock(); nla_put_failure: nlmsg_free(reply); return err; } static struct dp_meter *dp_meter_create(struct nlattr **a) { struct nlattr *nla; int rem; u16 n_bands = 0; struct dp_meter *meter; struct dp_meter_band *band; int err; /* Validate attributes, count the bands. */ if (!a[OVS_METER_ATTR_BANDS]) return ERR_PTR(-EINVAL); nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) if (++n_bands > DP_MAX_BANDS) return ERR_PTR(-EINVAL); /* Allocate and set up the meter before locking anything. */ meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL_ACCOUNT); if (!meter) return ERR_PTR(-ENOMEM); meter->id = nla_get_u32(a[OVS_METER_ATTR_ID]); meter->used = div_u64(ktime_get_ns(), 1000 * 1000); meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0; meter->keep_stats = !a[OVS_METER_ATTR_CLEAR]; spin_lock_init(&meter->lock); if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) { meter->stats = *(struct ovs_flow_stats *) nla_data(a[OVS_METER_ATTR_STATS]); } meter->n_bands = n_bands; /* Set up meter bands. */ band = meter->bands; nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) { struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; u32 band_max_delta_t; err = nla_parse_deprecated((struct nlattr **)&attr, OVS_BAND_ATTR_MAX, nla_data(nla), nla_len(nla), band_policy, NULL); if (err) goto exit_free_meter; if (!attr[OVS_BAND_ATTR_TYPE] || !attr[OVS_BAND_ATTR_RATE] || !attr[OVS_BAND_ATTR_BURST]) { err = -EINVAL; goto exit_free_meter; } band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); if (band->rate == 0) { err = -EINVAL; goto exit_free_meter; } band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]); /* Figure out max delta_t that is enough to fill any bucket. * Keep max_delta_t size to the bucket units: * pkts => 1/1000 packets, kilobits => bits. * * Start with a full bucket. */ band->bucket = band->burst_size * 1000ULL; band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; band++; } return meter; exit_free_meter: kfree(meter); return ERR_PTR(err); } static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct dp_meter *meter, *old_meter; struct sk_buff *reply; struct ovs_header *ovs_reply_header; struct ovs_header *ovs_header = genl_info_userhdr(info); struct dp_meter_table *meter_tbl; struct datapath *dp; int err; u32 meter_id; bool failed; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; meter = dp_meter_create(a); if (IS_ERR(meter)) return PTR_ERR(meter); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, &ovs_reply_header); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto exit_free_meter; } ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } meter_tbl = &dp->meter_tbl; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); old_meter = lookup_meter(meter_tbl, meter_id); err = detach_meter(meter_tbl, old_meter); if (err) goto exit_unlock; err = attach_meter(meter_tbl, meter); if (err) goto exit_free_old_meter; ovs_unlock(); /* Build response with the meter_id and stats from * the old meter, if any. */ failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id); WARN_ON(failed); if (old_meter) { spin_lock_bh(&old_meter->lock); if (old_meter->keep_stats) { err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); } spin_unlock_bh(&old_meter->lock); ovs_meter_free(old_meter); } genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_free_old_meter: ovs_meter_free(old_meter); exit_unlock: ovs_unlock(); nlmsg_free(reply); exit_free_meter: kfree(meter); return err; } static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *meter; struct sk_buff *reply; struct datapath *dp; u32 meter_id; int err; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } /* Locate meter, copy stats. */ meter = lookup_meter(&dp->meter_tbl, meter_id); if (!meter) { err = -ENOENT; goto exit_unlock; } spin_lock_bh(&meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); spin_unlock_bh(&meter->lock); if (err) goto exit_unlock; ovs_unlock(); genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_unlock: ovs_unlock(); nlmsg_free(reply); return err; } static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *old_meter; struct sk_buff *reply; struct datapath *dp; u32 meter_id; int err; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); old_meter = lookup_meter(&dp->meter_tbl, meter_id); if (old_meter) { spin_lock_bh(&old_meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); spin_unlock_bh(&old_meter->lock); err = detach_meter(&dp->meter_tbl, old_meter); if (err) goto exit_unlock; } ovs_unlock(); ovs_meter_free(old_meter); genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_unlock: ovs_unlock(); nlmsg_free(reply); return err; } /* Meter action execution. * * Return true 'meter_id' drop band is triggered. The 'skb' should be * dropped by the caller'. */ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long long int long_delta_ms; struct dp_meter_band *band; struct dp_meter *meter; int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; u32 delta_ms; u32 cost; meter = lookup_meter(&dp->meter_tbl, meter_id); /* Do not drop the packet when there is no meter. */ if (!meter) return false; /* Lock the meter while using it. */ spin_lock(&meter->lock); long_delta_ms = (now_ms - meter->used); /* ms */ if (long_delta_ms < 0) { /* This condition means that we have several threads fighting * for a meter lock, and the one who received the packets a * bit later wins. Assuming that all racing threads received * packets at the same time to avoid overflow. */ long_delta_ms = 0; } /* Make sure delta_ms will not be too large, so that bucket will not * wrap around below. */ delta_ms = (long_delta_ms > (long long int)meter->max_delta_t) ? meter->max_delta_t : (u32)long_delta_ms; /* Update meter statistics. */ meter->used = now_ms; meter->stats.n_packets += 1; meter->stats.n_bytes += skb->len; /* Bucket rate is either in kilobits per second, or in packets per * second. We maintain the bucket in the units of either bits or * 1/1000th of a packet, correspondingly. * Then, when rate is multiplied with milliseconds, we get the * bucket units: * msec * kbps = bits, and * msec * packets/sec = 1/1000 packets. * * 'cost' is the number of bucket units in this packet. */ cost = (meter->kbps) ? skb->len * 8 : 1000; /* Update all bands and find the one hit with the highest rate. */ for (i = 0; i < meter->n_bands; ++i) { long long int max_bucket_size; band = &meter->bands[i]; max_bucket_size = band->burst_size * 1000LL; band->bucket += delta_ms * band->rate; if (band->bucket > max_bucket_size) band->bucket = max_bucket_size; if (band->bucket >= cost) { band->bucket -= cost; } else if (band->rate > band_exceeded_rate) { band_exceeded_rate = band->rate; band_exceeded_max = i; } } if (band_exceeded_max >= 0) { /* Update band statistics. */ band = &meter->bands[band_exceeded_max]; band->stats.n_packets += 1; band->stats.n_bytes += skb->len; /* Drop band triggered, let the caller drop the 'skb'. */ if (band->type == OVS_METER_BAND_TYPE_DROP) { spin_unlock(&meter->lock); return true; } } spin_unlock(&meter->lock); return false; } static const struct genl_small_ops dp_meter_genl_ops[] = { { .cmd = OVS_METER_CMD_FEATURES, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_meter_cmd_features }, { .cmd = OVS_METER_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ .doit = ovs_meter_cmd_set, }, { .cmd = OVS_METER_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_meter_cmd_get, }, { .cmd = OVS_METER_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ .doit = ovs_meter_cmd_del }, }; static const struct genl_multicast_group ovs_meter_multicast_group = { .name = OVS_METER_MCGROUP, }; struct genl_family dp_meter_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_METER_FAMILY, .version = OVS_METER_VERSION, .maxattr = OVS_METER_ATTR_MAX, .policy = meter_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_meter_genl_ops, .n_small_ops = ARRAY_SIZE(dp_meter_genl_ops), .resv_start_op = OVS_METER_CMD_GET + 1, .mcgrps = &ovs_meter_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; int ovs_meters_init(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti; unsigned long free_mem_bytes; ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); if (!ti) return -ENOMEM; /* Allow meters in a datapath to use ~3.12% of physical memory. */ free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), DP_METER_NUM_MAX); if (!tbl->max_meters_allowed) goto out_err; rcu_assign_pointer(tbl->ti, ti); tbl->count = 0; return 0; out_err: dp_meter_instance_free(ti); return -ENOMEM; } void ovs_meters_exit(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); int i; for (i = 0; i < ti->n_meters; i++) ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); dp_meter_instance_free(ti); }
3 3 3 2 5 8 159 145 17 5 152 3 5 151 3 17 145 3 2 3 3 2 3 3 3 154 14 2 3 140 2 156 154 156 156 154 147 4 4 4 4 9 2 2 2 2 1 1 1 1 1 1 9 5 1 4 2 9 9 2 17 7 1 11 1 16 1 2 14 12 5 4 7 2 4 9 3 3 6 4 1 2 3 4 2 1 2 2 2 3 3 1 1 4 4 3 3 4 4 4 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/igmp.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/if_ether.h> #include <net/ip.h> #include <net/netlink.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/addrconf.h> #endif #include "br_private.h" static bool br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { *timer = br_timer_value(&pmctx->ip4_mc_router_timer); return !hlist_unhashed(&pmctx->ip4_rlist); } static bool br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { #if IS_ENABLED(CONFIG_IPV6) *timer = br_timer_value(&pmctx->ip6_mc_router_timer); return !hlist_unhashed(&pmctx->ip6_rlist); #else *timer = 0; return false; #endif } static size_t __br_rports_one_size(void) { return nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PORT */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_TIMER */ nla_total_size(sizeof(u8)) + /* MDBA_ROUTER_PATTR_TYPE */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET_TIMER */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET6_TIMER */ nla_total_size(sizeof(u32)); /* MDBA_ROUTER_PATTR_VID */ } size_t br_rports_size(const struct net_bridge_mcast *brmctx) { struct net_bridge_mcast_port *pmctx; size_t size = nla_total_size(0); /* MDBA_ROUTER */ rcu_read_lock(); hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) size += __br_rports_one_size(); #if IS_ENABLED(CONFIG_IPV6) hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) size += __br_rports_one_size(); #endif rcu_read_unlock(); return size; } int br_rports_fill_info(struct sk_buff *skb, const struct net_bridge_mcast *brmctx) { u16 vid = brmctx->vlan ? brmctx->vlan->vid : 0; bool have_ip4_mc_rtr, have_ip6_mc_rtr; unsigned long ip4_timer, ip6_timer; struct nlattr *nest, *port_nest; struct net_bridge_port *p; if (!brmctx->multicast_router || !br_rports_have_mc_router(brmctx)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; list_for_each_entry_rcu(p, &brmctx->br->port_list, list) { struct net_bridge_mcast_port *pmctx; if (vid) { struct net_bridge_vlan *v; v = br_vlan_find(nbp_vlan_group(p), vid); if (!v) continue; pmctx = &v->port_mcast_ctx; } else { pmctx = &p->multicast_ctx; } have_ip4_mc_rtr = br_ip4_rports_get_timer(pmctx, &ip4_timer); have_ip6_mc_rtr = br_ip6_rports_get_timer(pmctx, &ip6_timer); if (!have_ip4_mc_rtr && !have_ip6_mc_rtr) continue; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto fail; if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, p->multicast_ctx.multicast_router) || (have_ip4_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, ip4_timer)) || (have_ip6_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, ip6_timer)) || (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) { nla_nest_cancel(skb, port_nest); goto fail; } nla_nest_end(skb, port_nest); } nla_nest_end(skb, nest); return 0; fail: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) { e->state = flags & MDB_PG_FLAGS_PERMANENT; e->flags = 0; if (flags & MDB_PG_FLAGS_OFFLOAD) e->flags |= MDB_FLAGS_OFFLOAD; if (flags & MDB_PG_FLAGS_FAST_LEAVE) e->flags |= MDB_FLAGS_FAST_LEAVE; if (flags & MDB_PG_FLAGS_STAR_EXCL) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED) e->flags |= MDB_FLAGS_OFFLOAD_FAILED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, struct nlattr **mdb_attrs) { memset(ip, 0, sizeof(struct br_ip)); ip->vid = entry->vid; ip->proto = entry->addr.proto; switch (ip->proto) { case htons(ETH_P_IP): ip->dst.ip4 = entry->addr.u.ip4; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip->dst.ip6 = entry->addr.u.ip6; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #endif default: ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr); } } static int __mdb_fill_srcs(struct sk_buff *skb, struct net_bridge_port_group *p) { struct net_bridge_group_src *ent; struct nlattr *nest, *nest_ent; if (hlist_empty(&p->src_list)) return 0; nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST); if (!nest) return -EMSGSIZE; hlist_for_each_entry_rcu(ent, &p->src_list, node, lockdep_is_held(&p->key.port->br->multicast_lock)) { nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); if (!nest_ent) goto out_cancel_err; switch (ent->addr.proto) { case htons(ETH_P_IP): if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, ent->addr.src.ip4)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, &ent->addr.src.ip6)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #endif default: nla_nest_cancel(skb, nest_ent); continue; } if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, br_timer_value(&ent->timer))) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest); return 0; out_cancel_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int __mdb_fill_info(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *p) { bool dump_srcs_mode = false; struct timer_list *mtimer; struct nlattr *nest_ent; struct br_mdb_entry e; u8 flags = 0; int ifindex; memset(&e, 0, sizeof(e)); if (p) { ifindex = p->key.port->dev->ifindex; mtimer = &p->timer; flags = p->flags; } else { ifindex = mp->br->dev->ifindex; mtimer = &mp->timer; } __mdb_entry_fill_flags(&e, flags); e.ifindex = ifindex; e.vid = mp->addr.vid; if (mp->addr.proto == htons(ETH_P_IP)) { e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) } else if (mp->addr.proto == htons(ETH_P_IPV6)) { e.addr.u.ip6 = mp->addr.dst.ip6; #endif } else { ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); e.state = MDB_PERMANENT; } e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); if (!nest_ent) return -EMSGSIZE; if (nla_put_nohdr(skb, sizeof(e), &e) || nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, br_timer_value(mtimer))) goto nest_err; switch (mp->addr.proto) { case htons(ETH_P_IP): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3); if (mp->addr.src.ip4) { if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, mp->addr.src.ip4)) goto nest_err; break; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2); if (!ipv6_addr_any(&mp->addr.src.ip6)) { if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, &mp->addr.src.ip6)) goto nest_err; break; } break; #endif default: ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); } if (p) { if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) goto nest_err; if (dump_srcs_mode && (__mdb_fill_srcs(skb, p) || nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) goto nest_err; } nla_nest_end(skb, nest_ent); return 0; nest_err: nla_nest_cancel(skb, nest_ent); return -EMSGSIZE; } static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2]; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) return -EMSGSIZE; hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; if (idx < s_idx) goto skip; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!nest2) { err = -EMSGSIZE; break; } if (!s_pidx && mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) { nla_nest_cancel(skb, nest2); break; } } for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { if (!p->key.port) continue; if (pidx < s_pidx) goto skip_pg; err = __mdb_fill_info(skb, mp, p); if (err) { nla_nest_end(skb, nest2); goto out; } skip_pg: pidx++; } pidx = 0; s_pidx = 0; nla_nest_end(skb, nest2); skip: idx++; } out: cb->args[1] = idx; cb->args[2] = pidx; nla_nest_end(skb, nest); return err; } int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct net_bridge *br = netdev_priv(dev); struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_GETMDB, sizeof(*bpm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->ifindex = dev->ifindex; rcu_read_lock(); err = br_mdb_fill_info(skb, cb, dev); if (err) goto out; err = br_rports_fill_info(skb, &br->multicast_ctx); if (err) goto out; out: rcu_read_unlock(); nlmsg_end(skb, nlh); return err; } static int nlmsg_populate_mdb_fill(struct sk_buff *skb, struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct nlmsghdr *nlh; struct br_port_msg *bpm; struct nlattr *nest, *nest2; nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) goto cancel; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (nest2 == NULL) goto end; if (__mdb_fill_info(skb, mp, pg)) goto end; nla_nest_end(skb, nest2); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t rtnl_mdb_nlmsg_pg_size(const struct net_bridge_port_group *pg) { struct net_bridge_group_src *ent; size_t nlmsg_size, addr_size = 0; /* MDBA_MDB_ENTRY_INFO */ nlmsg_size = nla_total_size(sizeof(struct br_mdb_entry)) + /* MDBA_MDB_EATTR_TIMER */ nla_total_size(sizeof(u32)); if (!pg) goto out; /* MDBA_MDB_EATTR_RTPROT */ nlmsg_size += nla_total_size(sizeof(u8)); switch (pg->key.addr.proto) { case htons(ETH_P_IP): /* MDBA_MDB_EATTR_SOURCE */ if (pg->key.addr.src.ip4) nlmsg_size += nla_total_size(sizeof(__be32)); if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): /* MDBA_MDB_EATTR_SOURCE */ if (!ipv6_addr_any(&pg->key.addr.src.ip6)) nlmsg_size += nla_total_size(sizeof(struct in6_addr)); if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); break; #endif } /* MDBA_MDB_EATTR_GROUP_MODE */ nlmsg_size += nla_total_size(sizeof(u8)); /* MDBA_MDB_EATTR_SRC_LIST nested attr */ if (!hlist_empty(&pg->src_list)) nlmsg_size += nla_total_size(0); hlist_for_each_entry(ent, &pg->src_list, node) { /* MDBA_MDB_SRCLIST_ENTRY nested attr + * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER */ nlmsg_size += nla_total_size(0) + nla_total_size(addr_size) + nla_total_size(sizeof(u32)); } out: return nlmsg_size; } static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0) + /* Port group entry */ rtnl_mdb_nlmsg_pg_size(pg); } static void __br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type, bool notify_switchdev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; if (notify_switchdev) br_switchdev_mdb_notify(dev, mp, pg, type); skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { __br_mdb_notify(dev, mp, pg, type, true); } void br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg) { __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false); } static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u16 vid, u32 pid, u32 seq, int type, unsigned int flags) { struct nlattr *nest, *port_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (!nest) goto cancel; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto end; if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) { nla_nest_cancel(skb, port_nest); goto end; } if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) { nla_nest_cancel(skb, port_nest); goto end; } nla_nest_end(skb, port_nest); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_rtr_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + nla_total_size(sizeof(__u32)) + nla_total_size(sizeof(u16)); } void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; int ifindex; u16 vid; ifindex = pmctx ? pmctx->port->dev->ifindex : 0; vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid : 0; skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type, NTF_SELF); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } static const struct nla_policy br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = { [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static const struct nla_policy br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = { [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol), }; static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, MCAST_INCLUDE), [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol), [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), }; static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, struct netlink_ext_ack *extack) { switch (proto) { case htons(ETH_P_IP): if (nla_len(attr) != sizeof(struct in_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length"); return false; } if (ipv4_is_multicast(nla_get_in_addr(attr))) { NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed"); return false; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct in6_addr src; if (nla_len(attr) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length"); return false; } src = nla_get_in6_addr(attr); if (ipv6_addr_is_multicast(&src)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed"); return false; } break; } #endif default: NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address"); return false; } return true; } static struct net_bridge_mcast * __br_mdb_choose_context(struct net_bridge *br, const struct br_mdb_entry *entry, struct netlink_ext_ack *extack) { struct net_bridge_mcast *brmctx = NULL; struct net_bridge_vlan *v; if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { brmctx = &br->multicast_ctx; goto out; } if (!entry->vid) { NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled"); goto out; } v = br_vlan_find(br_vlan_group(br), entry->vid); if (!v) { NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured"); goto out; } if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) { NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled"); goto out; } brmctx = &v->br_mcast_ctx; out: return brmctx; } static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, unsigned char flags) { unsigned long now = jiffies; pg->flags = flags; pg->rt_protocol = cfg->rt_protocol; if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else timer_delete(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); return 0; } static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; unsigned long now = jiffies; for (pp = &mp->ports; (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port"); return -EEXIST; } return br_mdb_replace_group_sg(cfg, mp, p, brmctx, flags); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, MCAST_INCLUDE, cfg->rt_protocol, extack); if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); /* All of (*, G) EXCLUDE ports need to be added to the new (S, G) for * proper replication. */ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) { struct net_bridge_mdb_entry *star_mp; struct br_ip star_group; star_group = p->key.addr; memset(&star_group.src, 0, sizeof(star_group.src)); star_mp = br_mdb_ip_get(cfg->br, &star_group); if (star_mp) br_multicast_sg_add_exclude_ports(star_mp, p); } return 0; } static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg, struct br_ip *src_ip, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *sgmp; struct br_mdb_config sg_cfg; struct br_ip sg_ip; u8 flags = 0; sg_ip = cfg->group; sg_ip.src = src_ip->src; sgmp = br_multicast_new_group(cfg->br, &sg_ip); if (IS_ERR(sgmp)) { NL_SET_ERR_MSG_MOD(extack, "Failed to add (S, G) MDB entry"); return PTR_ERR(sgmp); } if (cfg->entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; if (cfg->filter_mode == MCAST_EXCLUDE) flags |= MDB_PG_FLAGS_BLOCKED; memset(&sg_cfg, 0, sizeof(sg_cfg)); sg_cfg.br = cfg->br; sg_cfg.p = cfg->p; sg_cfg.entry = cfg->entry; sg_cfg.group = sg_ip; sg_cfg.src_entry = true; sg_cfg.filter_mode = MCAST_INCLUDE; sg_cfg.rt_protocol = cfg->rt_protocol; sg_cfg.nlflags = cfg->nlflags; return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack); } static int br_mdb_add_group_src(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct br_mdb_src_entry *src, struct netlink_ext_ack *extack) { struct net_bridge_group_src *ent; unsigned long now = jiffies; int err; ent = br_multicast_find_group_src(pg, &src->addr); if (!ent) { ent = br_multicast_new_group_src(pg, &src->addr); if (!ent) { NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry"); return -ENOSPC; } } else if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Source entry already exists"); return -EEXIST; } if (cfg->filter_mode == MCAST_INCLUDE && cfg->entry->state == MDB_TEMPORARY) mod_timer(&ent->timer, now + br_multicast_gmi(brmctx)); else timer_delete(&ent->timer); /* Install a (S, G) forwarding entry for the source. */ err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack); if (err) goto err_del_sg; ent->flags = BR_SGRP_F_INSTALLED | BR_SGRP_F_USER_ADDED; return 0; err_del_sg: __br_multicast_del_group_src(ent); return err; } static void br_mdb_del_group_src(struct net_bridge_port_group *pg, struct br_mdb_src_entry *src) { struct net_bridge_group_src *ent; ent = br_multicast_find_group_src(pg, &src->addr); if (WARN_ON_ONCE(!ent)) return; br_multicast_del_group_src(ent, false); } static int br_mdb_add_group_srcs(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { int i, err; for (i = 0; i < cfg->num_src_entries; i++) { err = br_mdb_add_group_src(cfg, pg, brmctx, &cfg->src_entries[i], extack); if (err) goto err_del_group_srcs; } return 0; err_del_group_srcs: for (i--; i >= 0; i--) br_mdb_del_group_src(pg, &cfg->src_entries[i]); return err; } static int br_mdb_replace_group_srcs(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { struct net_bridge_group_src *ent; struct hlist_node *tmp; int err; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; err = br_mdb_add_group_srcs(cfg, pg, brmctx, extack); if (err) goto err_clear_delete; hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_DELETE) br_multicast_del_group_src(ent, false); } return 0; err_clear_delete: hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_DELETE; return err; } static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { unsigned long now = jiffies; int err; err = br_mdb_replace_group_srcs(cfg, pg, brmctx, extack); if (err) return err; pg->flags = flags; pg->filter_mode = cfg->filter_mode; pg->rt_protocol = cfg->rt_protocol; if (!(flags & MDB_PG_FLAGS_PERMANENT) && cfg->filter_mode == MCAST_EXCLUDE) mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else timer_delete(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) br_multicast_star_g_handle_mode(pg, cfg->filter_mode); return 0; } static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; unsigned long now = jiffies; int err; for (pp = &mp->ports; (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port"); return -EEXIST; } return br_mdb_replace_group_star_g(cfg, mp, p, brmctx, flags, extack); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, cfg->filter_mode, cfg->rt_protocol, extack); if (unlikely(!p)) return -ENOMEM; err = br_mdb_add_group_srcs(cfg, p, brmctx, extack); if (err) goto err_del_port_group; rcu_assign_pointer(*pp, p); if (!(flags & MDB_PG_FLAGS_PERMANENT) && cfg->filter_mode == MCAST_EXCLUDE) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); /* If we are adding a new EXCLUDE port group (*, G), it needs to be * also added to all (S, G) entries for proper replication. */ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto) && cfg->filter_mode == MCAST_EXCLUDE) br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); return 0; err_del_port_group: br_multicast_del_port_group(p); return err; } static int br_mdb_add_group(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = cfg->entry; struct net_bridge_port *port = cfg->p; struct net_bridge_mdb_entry *mp; struct net_bridge *br = cfg->br; struct net_bridge_mcast *brmctx; struct br_ip group = cfg->group; unsigned char flags = 0; brmctx = __br_mdb_choose_context(br, entry, extack); if (!brmctx) return -EINVAL; mp = br_multicast_new_group(br, &group); if (IS_ERR(mp)) return PTR_ERR(mp); /* host join */ if (!port) { if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; } br_multicast_host_join(brmctx, mp, false); br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); return 0; } if (entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; if (br_multicast_is_star_g(&group)) return br_mdb_add_group_star_g(cfg, mp, brmctx, flags, extack); else return br_mdb_add_group_sg(cfg, mp, brmctx, flags, extack); } static int __br_mdb_add(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { int ret; spin_lock_bh(&cfg->br->multicast_lock); ret = br_mdb_add_group(cfg, extack); spin_unlock_bh(&cfg->br->multicast_lock); return ret; } static int br_mdb_config_src_entry_init(struct nlattr *src_entry, struct br_mdb_src_entry *src, __be16 proto, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBE_SRCATTR_MAX + 1]; int err; err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry, br_mdbe_src_list_entry_pol, extack); if (err) return err; if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS)) return -EINVAL; if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack)) return -EINVAL; src->addr.proto = proto; nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS], nla_len(tb[MDBE_SRCATTR_ADDRESS])); return 0; } static int br_mdb_config_src_list_init(struct nlattr *src_list, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *src_entry; int rem, err; int i = 0; nla_for_each_nested(src_entry, src_list, rem) cfg->num_src_entries++; if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) { NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)", PG_SRC_ENT_LIMIT - 1); return -EINVAL; } cfg->src_entries = kcalloc(cfg->num_src_entries, sizeof(struct br_mdb_src_entry), GFP_KERNEL); if (!cfg->src_entries) return -ENOMEM; nla_for_each_nested(src_entry, src_list, rem) { err = br_mdb_config_src_entry_init(src_entry, &cfg->src_entries[i], cfg->entry->addr.proto, extack); if (err) goto err_src_entry_init; i++; } return 0; err_src_entry_init: kfree(cfg->src_entries); return err; } static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg) { kfree(cfg->src_entries); } static int br_mdb_config_attrs_init(struct nlattr *set_attrs, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; int err; err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, set_attrs, br_mdbe_attrs_pol, extack); if (err) return err; if (mdb_attrs[MDBE_ATTR_SOURCE] && !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], cfg->entry->addr.proto, extack)) return -EINVAL; __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs); if (mdb_attrs[MDBE_ATTR_GROUP_MODE]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups"); return -EINVAL; } if (!br_multicast_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries"); return -EINVAL; } cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]); } else { cfg->filter_mode = MCAST_EXCLUDE; } if (mdb_attrs[MDBE_ATTR_SRC_LIST]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups"); return -EINVAL; } if (!br_multicast_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries"); return -EINVAL; } if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) { NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode"); return -EINVAL; } err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST], cfg, extack); if (err) return err; } if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) { NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list"); return -EINVAL; } if (mdb_attrs[MDBE_ATTR_RTPROT]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups"); return -EINVAL; } cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]); } return 0; } static int br_mdb_config_init(struct br_mdb_config *cfg, struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); memset(cfg, 0, sizeof(*cfg)); cfg->filter_mode = MCAST_EXCLUDE; cfg->rt_protocol = RTPROT_STATIC; cfg->nlflags = nlmsg_flags; cfg->br = netdev_priv(dev); if (!netif_running(cfg->br->dev)) { NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running"); return -EINVAL; } if (!br_opt_get(cfg->br, BROPT_MULTICAST_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled"); return -EINVAL; } cfg->entry = nla_data(tb[MDBA_SET_ENTRY]); if (cfg->entry->ifindex != cfg->br->dev->ifindex) { struct net_device *pdev; pdev = __dev_get_by_index(net, cfg->entry->ifindex); if (!pdev) { NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist"); return -ENODEV; } cfg->p = br_port_get_rtnl(pdev); if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port"); return -EINVAL; } if (cfg->p->br != cfg->br) { NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); return -EINVAL; } } if (cfg->entry->addr.proto == htons(ETH_P_IP) && ipv4_is_zeronet(cfg->entry->addr.u.ip4)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address 0.0.0.0 is not allowed"); return -EINVAL; } if (tb[MDBA_SET_ENTRY_ATTRS]) return br_mdb_config_attrs_init(tb[MDBA_SET_ENTRY_ATTRS], cfg, extack); else __mdb_entry_to_br_ip(cfg->entry, &cfg->group, NULL); return 0; } static void br_mdb_config_fini(struct br_mdb_config *cfg) { br_mdb_config_src_list_fini(cfg); } int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct br_mdb_config cfg; int err; err = br_mdb_config_init(&cfg, dev, tb, nlmsg_flags, extack); if (err) return err; err = -EINVAL; /* host join errors which can happen before creating the group */ if (!cfg.p && !br_group_is_l2(&cfg.group)) { /* don't allow any flags for host-joined IP groups */ if (cfg.entry->state) { NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); goto out; } if (!br_multicast_is_star_g(&cfg.group)) { NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); goto out; } } if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); goto out; } if (cfg.p) { if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); goto out; } vg = nbp_vlan_group(cfg.p); } else { vg = br_vlan_group(cfg.br); } /* If vlan filtering is enabled and VLAN is not specified * install mdb entry on all vlans configured on the port. */ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; err = __br_mdb_add(&cfg, extack); if (err) break; } } else { err = __br_mdb_add(&cfg, extack); } out: br_mdb_config_fini(&cfg); return err; } static int __br_mdb_del(const struct br_mdb_config *cfg) { struct br_mdb_entry *entry = cfg->entry; struct net_bridge *br = cfg->br; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip ip = cfg->group; int err = -EINVAL; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &ip); if (!mp) goto unlock; /* host leave */ if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) { br_multicast_host_leave(mp, false); err = 0; br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); goto unlock; } for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (!p->key.port || p->key.port->dev->ifindex != entry->ifindex) continue; br_multicast_del_pg(mp, p, pp); err = 0; break; } unlock: spin_unlock_bh(&br->multicast_lock); return err; } int br_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct br_mdb_config cfg; int err; err = br_mdb_config_init(&cfg, dev, tb, 0, extack); if (err) return err; if (cfg.p) vg = nbp_vlan_group(cfg.p); else vg = br_vlan_group(cfg.br); /* If vlan filtering is enabled and VLAN is not specified * delete mdb entry on all vlans configured on the port. */ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; err = __br_mdb_del(&cfg); } } else { err = __br_mdb_del(&cfg); } br_mdb_config_fini(&cfg); return err; } struct br_mdb_flush_desc { u32 port_ifindex; u16 vid; u8 rt_protocol; u8 state; u8 state_mask; }; static const struct nla_policy br_mdbe_attrs_del_bulk_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), [MDBE_ATTR_STATE_MASK] = NLA_POLICY_MASK(NLA_U8, MDB_PERMANENT), }; static int br_mdb_flush_desc_init(struct br_mdb_flush_desc *desc, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; desc->port_ifindex = entry->ifindex; desc->vid = entry->vid; desc->state = entry->state; if (!tb[MDBA_SET_ENTRY_ATTRS]) return 0; err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_SET_ENTRY_ATTRS], br_mdbe_attrs_del_bulk_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_STATE_MASK]) desc->state_mask = nla_get_u8(mdbe_attrs[MDBE_ATTR_STATE_MASK]); if (mdbe_attrs[MDBE_ATTR_RTPROT]) desc->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]); return 0; } static void br_mdb_flush_host(struct net_bridge *br, struct net_bridge_mdb_entry *mp, const struct br_mdb_flush_desc *desc) { u8 state; if (desc->port_ifindex && desc->port_ifindex != br->dev->ifindex) return; if (desc->rt_protocol) return; state = br_group_is_l2(&mp->addr) ? MDB_PERMANENT : 0; if (desc->state_mask && (state & desc->state_mask) != desc->state) return; br_multicast_host_leave(mp, true); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } static void br_mdb_flush_pgs(struct net_bridge *br, struct net_bridge_mdb_entry *mp, const struct br_mdb_flush_desc *desc) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;) { u8 state; if (desc->port_ifindex && desc->port_ifindex != p->key.port->dev->ifindex) { pp = &p->next; continue; } if (desc->rt_protocol && desc->rt_protocol != p->rt_protocol) { pp = &p->next; continue; } state = p->flags & MDB_PG_FLAGS_PERMANENT ? MDB_PERMANENT : 0; if (desc->state_mask && (state & desc->state_mask) != desc->state) { pp = &p->next; continue; } br_multicast_del_pg(mp, p, pp); } } static void br_mdb_flush(struct net_bridge *br, const struct br_mdb_flush_desc *desc) { struct net_bridge_mdb_entry *mp; spin_lock_bh(&br->multicast_lock); /* Safe variant is not needed because entries are removed from the list * upon group timer expiration or bridge deletion. */ hlist_for_each_entry(mp, &br->mdb_list, mdb_node) { if (desc->vid && desc->vid != mp->addr.vid) continue; br_mdb_flush_host(br, mp, desc); br_mdb_flush_pgs(br, mp, desc); } spin_unlock_bh(&br->multicast_lock); } int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct br_mdb_flush_desc desc = {}; int err; err = br_mdb_flush_desc_init(&desc, tb, extack); if (err) return err; br_mdb_flush(br, &desc); return 0; } static const struct nla_policy br_mdbe_attrs_get_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static int br_mdb_get_parse(struct net_device *dev, struct nlattr *tb[], struct br_ip *group, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_GET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; if (!tb[MDBA_GET_ENTRY_ATTRS]) { __mdb_entry_to_br_ip(entry, group, NULL); return 0; } err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_GET_ENTRY_ATTRS], br_mdbe_attrs_get_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_SOURCE] && !is_valid_mdb_source(mdbe_attrs[MDBE_ATTR_SOURCE], entry->addr.proto, extack)) return -EINVAL; __mdb_entry_to_br_ip(entry, group, mdbe_attrs); return 0; } static struct sk_buff * br_mdb_get_reply_alloc(const struct net_bridge_mdb_entry *mp) { struct net_bridge_port_group *pg; size_t nlmsg_size; nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0); if (mp->host_joined) nlmsg_size += rtnl_mdb_nlmsg_pg_size(NULL); for (pg = mlock_dereference(mp->ports, mp->br); pg; pg = mlock_dereference(pg->next, mp->br)) nlmsg_size += rtnl_mdb_nlmsg_pg_size(pg); return nlmsg_new(nlmsg_size, GFP_ATOMIC); } static int br_mdb_get_reply_fill(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, u32 portid, u32 seq) { struct nlattr *mdb_nest, *mdb_entry_nest; struct net_bridge_port_group *pg; struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, portid, seq, RTM_NEWMDB, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = mp->br->dev->ifindex; mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB); if (!mdb_nest) { err = -EMSGSIZE; goto cancel; } mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!mdb_entry_nest) { err = -EMSGSIZE; goto cancel; } if (mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) goto cancel; } for (pg = mlock_dereference(mp->ports, mp->br); pg; pg = mlock_dereference(pg->next, mp->br)) { err = __mdb_fill_info(skb, mp, pg); if (err) goto cancel; } nla_nest_end(skb, mdb_entry_nest); nla_nest_end(skb, mdb_nest); nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return err; } int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct sk_buff *skb; struct br_ip group; int err; err = br_mdb_get_parse(dev, tb, &group, extack); if (err) return err; /* Hold the multicast lock to ensure that the MDB entry does not change * between the time the reply size is determined and when the reply is * filled in. */ spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &group); if (!mp || (!mp->ports && !mp->host_joined)) { NL_SET_ERR_MSG_MOD(extack, "MDB entry not found"); err = -ENOENT; goto unlock; } skb = br_mdb_get_reply_alloc(mp); if (!skb) { err = -ENOMEM; goto unlock; } err = br_mdb_get_reply_fill(skb, mp, portid, seq); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to fill MDB get reply"); goto free; } spin_unlock_bh(&br->multicast_lock); return rtnl_unicast(skb, dev_net(dev), portid); free: kfree_skb(skb); unlock: spin_unlock_bh(&br->multicast_lock); return err; }
2 103 2639 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETLINK_H #define __LINUX_NETLINK_H #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/scm.h> #include <uapi/linux/netlink.h> struct net; void do_trace_netlink_extack(const char *msg); static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) { return (struct nlmsghdr *)skb->data; } enum netlink_skb_flags { NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; struct netlink_skb_parms { struct scm_creds creds; /* Skb credentials */ __u32 portid; __u32 dst_group; __u32 flags; struct sock *sk; bool nsid_is_set; int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) #define NETLINK_CTX_SIZE 48 void netlink_table_grab(void); void netlink_table_ungrab(void); #define NL_CFG_F_NONROOT_RECV (1 << 0) #define NL_CFG_F_NONROOT_SEND (1 << 1) /* optional Netlink kernel configuration parameters */ struct netlink_kernel_cfg { unsigned int groups; unsigned int flags; void (*input)(struct sk_buff *skb); int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release) (struct sock *sk, unsigned long *groups); }; struct sock *__netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg); static inline struct sock * netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) { return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 8 #define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute * @miss_type: attribute type which was missing * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length * @_msg_buf: output buffer for formatted message strings - don't access * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; const struct nlattr *miss_nest; u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) \ __extack->_msg = __msg; \ } while (0) /* We splice fmt with %s at each end even in the snprintf so that both calls * can use the same string constant, avoiding its duplication in .ro */ #define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ } while (0) #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) #define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) #define NL_SET_ERR_MSG_WEAK(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG((extack), msg); \ } while (0) #define NL_SET_ERR_MSG_WEAK_MOD(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG_MOD((extack), msg); \ } while (0) #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ (extack)->policy = (pol); \ } \ } while (0) #define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) #define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) { \ __extack->_msg = __msg; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } \ } while (0) #define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } while (0) #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) #define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \ NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args) #define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) { \ __extack->miss_nest = (nest); \ __extack->miss_type = (type); \ } \ } while (0) #define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ struct nlattr **__tb = (tb); \ u32 __attr = (type); \ int __retval; \ \ __retval = !__tb[__attr]; \ if (__retval) \ NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ __retval; \ }) static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { if (!extack) return; BUILD_BUG_ON(sizeof(extack->cookie) < sizeof(cookie)); memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack); int netlink_has_listeners(struct sock *sk, unsigned int group); bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfd(int fd); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb); static inline struct sk_buff * netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *nskb; nskb = skb_clone(skb, gfp_mask); if (!nskb) return NULL; /* This is a large skb, set destructor callback to release head */ if (is_vmalloc_addr(skb->head)) nskb->destructor = skb->destructor; return nskb; } /* * skb should fit one page. This choice is good for headerless malloc. * But we should limit to 8K so that userspace does not have to * use enormous buffer sizes on recvmsg() calls just to avoid * MSG_TRUNC when PAGE_SIZE is very large. */ #if PAGE_SIZE < 8192UL #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(PAGE_SIZE) #else #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(8192UL) #endif #define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); void *data; /* the module that dump function belong to */ struct module *module; struct netlink_ext_ack *extack; u16 family; u16 answer_flags; u32 min_dump_alloc; unsigned int prev_seq, seq; int flags; bool strict_check; union { u8 ctx[NETLINK_CTX_SIZE]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. */ long args[6]; }; }; #define NL_ASSERT_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) struct netlink_notify { struct net *net; u32 portid; int protocol; }; struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); struct netlink_dump_control { int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*done)(struct netlink_callback *); struct netlink_ext_ack *extack; void *data; struct module *module; u32 min_dump_alloc; int flags; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control); static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { if (!control->module) control->module = THIS_MODULE; return __netlink_dump_start(ssk, skb, nlh, control); } struct netlink_tap { struct net_device *dev; struct module *module; struct list_head list; }; int netlink_add_tap(struct netlink_tap *nt); int netlink_remove_tap(struct netlink_tap *nt); bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *ns, int cap); bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */
237 425 1082 1 425 74 341 74 341 341 342 342 341 342 74 269 341 342 74 74 74 74 74 74 74 74 74 74 726 725 4990 4443 637 2 2 2 2 2 2 4991 4988 4988 1 357 637 637 4094 4990 4987 2 992 4091 4992 355 355 355 309 6 314 315 145 170 41 273 314 315 315 314 315 315 315 2 2 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 38 39 39 236 4870 4868 4871 4636 4638 4640 39 39 39 39 39 39 4793 4792 4794 4793 4789 4622 4626 2061 86 4622 4174 638 4623 107 107 106 107 129 129 129 129 725 849 848 849 847 4793 19 612 1 236 847 40 848 849 848 848 848 848 848 1 1 1 1 1 1 848 84 84 578 579 578 578 1603 1604 1604 1603 1186 1266 446 846 4 342 342 342 342 342 4 4 4 4 3 1 103 133 425 425 425 425 236 236 4565 2298 4017 3984 39 4017 4018 4016 4014 4019 4019 4016 4018 1875 4014 39 39 4014 4017 4015 4013 1031 1029 1031 225 11 1031 225 1030 1031 613 425 1032 5490 2580 1032 4016 1032 1942 1704 4549 4641 4635 4639 4669 29 4665 4669 4674 29 4637 4645 2 4637 4643 4640 4639 4640 4638 4621 1735 99 99 99 99 99 99 99 84 84 84 84 84 84 84 84 84 84 84 84 84 99 98 99 84 84 84 84 84 84 84 84 84 84 74 74 99 99 26 74 99 99 99 99 99 84 84 84 84 84 84 84 84 84 84 84 99 99 84 83 84 84 84 84 84 84 84 84 84 99 99 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 99 99 99 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 84 99 99 99 99 99 99 74 74 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 // SPDX-License-Identifier: GPL-2.0 /* * Block multiqueue core code * * Copyright (C) 2013-2014 Jens Axboe * Copyright (C) 2013-2014 Christoph Hellwig */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/llist.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/topology.h> #include <linux/sched/signal.h> #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> #include <linux/part_stat.h> #include <linux/sched/isolation.h> #include <trace/events/block.h> #include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static DEFINE_MUTEX(blk_mq_cpuhp_lock); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags); /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { return !list_empty_careful(&hctx->dispatch) || sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; if (!sbitmap_test_bit(&hctx->ctx_map, bit)) sbitmap_set_bit(&hctx->ctx_map, bit); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; sbitmap_clear_bit(&hctx->ctx_map, bit); } struct mq_inflight { struct block_device *part; unsigned int inflight[2]; }; static bool blk_mq_check_in_driver(struct request *rq, void *priv) { struct mq_inflight *mi = priv; if (rq->rq_flags & RQF_IO_STAT && (!bdev_is_partition(mi->part) || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver, &mi); inflight[READ] = mi.inflight[READ]; inflight[WRITE] = mi.inflight[WRITE]; } #ifdef CONFIG_LOCKDEP static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { if (!owner) return false; if (!q->mq_freeze_depth) { q->mq_freeze_owner = owner; q->mq_freeze_owner_depth = 1; q->mq_freeze_disk_dead = !q->disk || test_bit(GD_DEAD, &q->disk->state) || !blk_queue_registered(q); q->mq_freeze_queue_dying = blk_queue_dying(q); return true; } if (owner == q->mq_freeze_owner) q->mq_freeze_owner_depth += 1; return false; } /* verify the last unfreeze in owner context */ static bool blk_unfreeze_check_owner(struct request_queue *q) { if (q->mq_freeze_owner != current) return false; if (--q->mq_freeze_owner_depth == 0) { q->mq_freeze_owner = NULL; return true; } return false; } #else static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { return false; } static bool blk_unfreeze_check_owner(struct request_queue *q) { return false; } #endif bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner) { bool freeze; mutex_lock(&q->mq_freeze_lock); freeze = blk_freeze_set_owner(q, owner); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } else { mutex_unlock(&q->mq_freeze_lock); } return freeze; } void blk_freeze_queue_start(struct request_queue *q) { if (__blk_freeze_queue_start(q, current)) blk_freeze_acquire_lock(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout) { return wait_event_timeout(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter), timeout); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); void blk_mq_freeze_queue_nomemsave(struct request_queue *q) { blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave); bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { bool unfreeze; mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } unfreeze = blk_unfreeze_check_owner(q); mutex_unlock(&q->mq_freeze_lock); return unfreeze; } void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q) { if (__blk_mq_unfreeze_queue(q, false)) blk_unfreeze_release_lock(q); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore); /* * non_owner variant of blk_freeze_queue_start * * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen * by the same task. This is fragile and should not be used if at all * possible. */ void blk_freeze_queue_start_non_owner(struct request_queue *q) { __blk_freeze_queue_start(q, NULL); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); /* non_owner variant of blk_mq_unfreeze_queue */ void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) { __blk_mq_unfreeze_queue(q, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed. */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->queue_lock, flags); if (!q->quiesce_depth++) blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has * been started on or more of the request_queues of the tag_set. This * function only waits for the quiesce on those request_queues that had * the quiesce flag set using blk_mq_quiesce_queue_nowait. */ void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { if (set->flags & BLK_MQ_F_BLOCKING) synchronize_srcu(set->srcu); else synchronize_rcu(); } EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); /** * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. * * Note: this function does not prevent that the struct request end_io() * callback function is invoked. Once this function is returned, we make * sure no dispatch can happen until the queue is unquiesced via * blk_mq_unquiesce_queue(). */ void blk_mq_quiesce_queue(struct request_queue *q) { blk_mq_quiesce_queue_nowait(q); /* nothing to wait for non-mq queues */ if (queue_is_mq(q)) blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); /* * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() * @q: request queue. * * This function recovers queue into the state before quiescing * which is done by blk_mq_quiesce_queue. */ void blk_mq_unquiesce_queue(struct request_queue *q) { unsigned long flags; bool run_queue = false; spin_lock_irqsave(&q->queue_lock, flags); if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { ; } else if (!--q->quiesce_depth) { blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); run_queue = true; } spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ if (run_queue) blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; mutex_lock(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } mutex_unlock(&set->tag_list_lock); blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; mutex_lock(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_unquiesce_queue(q); } mutex_unlock(&set->tag_list_lock); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); } void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = blk_time_get_ns(); blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); /* Set start and alloc time when the allocated request is actually used */ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { #ifdef CONFIG_BLK_RQ_ALLOC_TIME if (blk_queue_rq_alloc_time(rq->q)) rq->alloc_time_ns = alloc_time_ns; else rq->alloc_time_ns = 0; #endif } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; struct request_queue *q = data->q; struct request *rq = tags->static_rqs[tag]; rq->q = q; rq->mq_ctx = ctx; rq->mq_hctx = hctx; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; } rq->timeout = 0; rq->part = NULL; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; rq->nr_integrity_segments = 0; rq->end_io = NULL; rq->end_io_data = NULL; blk_crypto_rq_set_defaults(rq); INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); req_ref_set(rq, 1); if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = data->q->elevator; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); if (e->type->ops.prepare_request) e->type->ops.prepare_request(rq); } return rq; } static inline struct request * __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) { unsigned int tag, tag_offset; struct blk_mq_tags *tags; struct request *rq; unsigned long tag_mask; int i, nr = 0; tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); if (unlikely(!tag_mask)) return NULL; tags = blk_mq_tags_from_data(data); for (i = 0; tag_mask; i++) { if (!(tag_mask & (1UL << i))) continue; tag = tag_offset + i; prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); rq_list_add_head(data->cached_rqs, rq); nr++; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; return rq_list_pop(data->cached_rqs); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; u64 alloc_time_ns = 0; struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is * enabled for the queue. */ data->rq_flags |= RQF_SCHED_TAGS; /* * Flush/passthrough requests are special and go directly to the * dispatch list. */ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && !blk_op_is_passthrough(data->cmd_flags)) { struct elevator_mq_ops *ops = &q->elevator->type->ops; WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); data->rq_flags |= RQF_USE_SCHED; if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } } else { blk_mq_tag_busy(data->hctx); } if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; /* * Try batched alloc if we want more than 1 tag. */ if (data->nr_tags > 1) { rq = __blk_mq_alloc_requests_batch(data); if (rq) { blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } data->nr_tags = 1; } /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now. */ tag = blk_mq_get_tag(data); if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; /* * Give up the CPU and sleep for a random short time to * ensure that thread using a realtime scheduling class * are migrated off the CPU, and thus off the hctx that * is going away. */ msleep(3); goto retry; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data->hctx); rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_plug *plug, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = plug->nr_ios, .cached_rqs = &plug->cached_rqs, .ctx = NULL, .hctx = NULL }; struct request *rq; if (blk_queue_enter(q, flags)) return NULL; plug->nr_ios = 1; rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) blk_queue_exit(q); return rq; } static struct request *blk_mq_alloc_cached_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_plug *plug = current->plug; struct request *rq; if (!plug) return NULL; if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; rq_list_pop(&plug->cached_rqs); blk_mq_rq_time_init(rq, blk_time_get_ns()); } rq->cmd_flags = opf; INIT_LIST_HEAD(&rq->queuelist); return rq; } struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct request *rq; rq = blk_mq_alloc_cached_request(q, opf, flags); if (!rq) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; int ret; ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); rq = __blk_mq_alloc_requests(&data); if (!rq) goto out_queue_exit; } rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; u64 alloc_time_ns = 0; struct request *rq; unsigned int cpu; unsigned int tag; int ret; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a * different hardware context. No need to complicate the low level * allocator for this for the rare use case of a command tied to * a specific queue. */ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); /* * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ ret = -EXDEV; data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); if (q->elevator) data.rq_flags |= RQF_SCHED_TAGS; else blk_mq_tag_busy(data.hctx); if (flags & BLK_MQ_REQ_RESERVED) data.rq_flags |= RQF_RESV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; if (!(data.rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data.hctx); rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; blk_zone_finish_request(rq); if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* * For postflush request that may need to be * completed twice, we should clear this flag * to avoid double finish_request() on the rq. */ rq->rq_flags &= ~RQF_USE_SCHED; } } static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; if (rq->tag != BLK_MQ_NO_TAG) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, ctx, rq->tag); } if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); } void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_finish_request(rq); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->q->disk ? rq->q->disk->disk_name : "?", (__force unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); } EXPORT_SYMBOL(blk_dump_rq_flags); static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->rq_flags & RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } static void blk_print_req_error(struct request *req, blk_status_t status) { printk_ratelimited(KERN_ERR "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", blk_status_to_str(status), req->q->disk ? req->q->disk->disk_name : "?", blk_rq_pos(req), (__force u32)req_op(req), blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* * Fully end IO on a request. Does not support partial completions, or * errors. */ static void blk_complete_request(struct request *req) { const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; int total_bytes = blk_rq_bytes(req); struct bio *bio = req->bio; trace_block_rq_complete(req, BLK_STS_OK, total_bytes); if (!bio) return; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ blk_crypto_rq_put_keyslot(req); blk_account_io_completion(req, total_bytes); do { struct bio *next = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (blk_req_bio_is_zone_append(req, bio)) blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); bio = next; } while (bio); /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ if (!req->end_io) { req->bio = NULL; req->__data_len = 0; } } /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; bool quiet = req->rq_flags & RQF_QUIET; int total_bytes; trace_block_rq_complete(req, error, nr_bytes); if (!req->bio) return false; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } blk_account_io_completion(req, nr_bytes); total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); if (unlikely(error)) bio->bi_status = error; if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported * as the BIO fragments may end up not being written * sequentially. */ bio->bi_status = BLK_STS_IOERR; } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (unlikely(quiet)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, bio_bytes); /* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { if (blk_req_bio_is_zone_append(req, bio)) blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); } total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ req->nr_phys_segments = blk_recalc_rq_segments(req); } return true; } EXPORT_SYMBOL_GPL(blk_update_request); static inline void blk_account_io_done(struct request *req, u64 now) { trace_block_io_done(req); /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_local_dec(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } static inline bool blk_rq_passthrough_stats(struct request *req) { struct bio *bio = req->bio; if (!blk_queue_passthrough_stat(req->q)) return false; /* Requests without a bio do not transfer data. */ if (!bio) return false; /* * Stats are accumulated in the bdev, so must have one attached to a * bio to track stats. Most drivers do not set the bdev for passthrough * requests, but nvme is one that will set it. */ if (!bio->bi_bdev) return false; /* * We don't know what a passthrough command does, but we know the * payload size and data direction. Ensuring the size is aligned to the * block size filters out most commands with payloads that don't * represent sector access. */ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) return false; return true; } static inline void blk_account_io_start(struct request *req) { trace_block_io_start(req); if (!blk_queue_io_stat(req->q)) return; if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) return; req->rq_flags |= RQF_IO_STAT; req->start_time_ns = blk_time_get_ns(); /* * All non-passthrough requests are created from a bio with one * exception: when a flush command that is part of a flush sequence * generated by the state machine in blk-flush.c is cloned onto the * lower device by dm-multipath we can get here without a bio. */ if (req->bio) req->part = req->bio->bi_bdev; else req->part = req->q->disk->part0; part_stat_lock(); update_io_ticks(req->part, jiffies, false); part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { if (rq->rq_flags & RQF_STATS) blk_stat_add(rq, now); blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); } inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) __blk_mq_end_request_acct(rq, blk_time_get_ns()); blk_mq_finish_request(rq); if (rq->end_io) { rq_qos_done(rq->q, rq); if (rq->end_io(rq, error) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else { blk_mq_free_request(rq); } } EXPORT_SYMBOL(__blk_mq_end_request); void blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); __blk_mq_end_request(rq, error); } EXPORT_SYMBOL(blk_mq_end_request); #define TAG_COMP_BATCH 32 static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, int *tag_array, int nr_tags) { struct request_queue *q = hctx->queue; blk_mq_sub_active_requests(hctx, nr_tags); blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); } void blk_mq_end_request_batch(struct io_comp_batch *iob) { int tags[TAG_COMP_BATCH], nr_tags = 0; struct blk_mq_hw_ctx *cur_hctx = NULL; struct request *rq; u64 now = 0; if (iob->need_ts) now = blk_time_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); prefetch(rq->rq_next); blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); blk_mq_finish_request(rq); rq_qos_done(rq->q, rq); /* * If end_io handler returns NONE, then it still has * ownership of the request. */ if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) continue; WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (!req_ref_put_and_test(rq)) continue; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { if (cur_hctx) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); nr_tags = 0; cur_hctx = rq->mq_hctx; } tags[nr_tags++] = rq->tag; } if (nr_tags) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); } EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); static void blk_complete_reqs(struct llist_head *list) { struct llist_node *entry = llist_reverse_order(llist_del_all(list)); struct request *rq, *next; llist_for_each_entry_safe(rq, next, entry, ipi_list) rq->q->mq_ops->complete(rq); } static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } static int blk_softirq_cpu_dead(unsigned int cpu) { blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); return 0; } static void __blk_mq_complete_request_remote(void *data) { __raise_softirq_irqoff(BLOCK_SOFTIRQ); } static inline bool blk_mq_complete_need_ipi(struct request *rq) { int cpu = raw_smp_processor_id(); if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) return false; /* * With force threaded interrupts enabled, raising softirq from an SMP * function call will always result in waking the ksoftirqd thread. * This is probably worse than completing the request on a different * cache domain. */ if (force_irqthreads()) return false; /* same CPU or cache domain and capacity? Complete locally */ if (cpu == rq->mq_ctx->cpu || (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && cpus_share_cache(cpu, rq->mq_ctx->cpu) && cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) return false; /* don't try to IPI to an offline CPU */ return cpu_online(rq->mq_ctx->cpu); } static void blk_mq_complete_send_ipi(struct request *rq) { unsigned int cpu; cpu = rq->mq_ctx->cpu; if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) { struct llist_head *list; preempt_disable(); list = this_cpu_ptr(&blk_cpu_done); if (llist_add(&rq->ipi_list, list)) raise_softirq(BLOCK_SOFTIRQ); preempt_enable(); } bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* * For request which hctx has only one ctx mapping, * or a polled request, always complete locally, * it's pointless to redirect the completion. */ if ((rq->mq_hctx->nr_ctx == 1 && rq->mq_ctx->cpu == raw_smp_processor_id()) || rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { blk_mq_complete_send_ipi(rq); return true; } if (rq->q->nr_hw_queues == 1) { blk_mq_raise_softirq(rq); return true; } return false; } EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed * * Description: * Complete a request by scheduling the ->complete_rq operation. **/ void blk_mq_complete_request(struct request *rq) { if (!blk_mq_complete_request_remote(rq)) rq->q->mq_ops->complete(rq); } EXPORT_SYMBOL(blk_mq_complete_request); /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started * * Function used by device drivers to notify the block layer that a request * is going to be processed now, so blk layer can do proper initializations * such as starting the timeout timer. */ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) blk_integrity_prepare(rq); if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } EXPORT_SYMBOL(blk_mq_start_request); /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. This is important for md arrays to benefit from merging * requests. */ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) { if (plug->multiple_queues) return BLK_MAX_REQUEST_COUNT * 2; return BLK_MAX_REQUEST_COUNT; } static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { struct request *last = rq_list_peek(&plug->mq_list); if (!plug->rq_count) { trace_block_plug(rq->q); } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || (!blk_queue_nomerges(rq->q) && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_mq_flush_plug_list(plug, false); last = NULL; trace_block_plug(rq->q); } if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; /* * Any request allocated from sched tags can't be issued to * ->queue_rqs() directly */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. * * Note: * This function will invoke @done directly if the queue is dead. */ void blk_execute_rq_nowait(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); blk_account_io_start(rq); if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); struct blk_rq_wait { struct completion done; blk_status_t ret; }; static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) { struct blk_rq_wait *wait = rq->end_io_data; wait->ret = ret; complete(&wait->done); return RQ_END_IO_NONE; } bool blk_rq_is_poll(struct request *rq) { if (!rq->mq_hctx) return false; if (rq->mq_hctx->type != HCTX_TYPE_POLL) return false; return true; } EXPORT_SYMBOL_GPL(blk_rq_is_poll); static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); cond_resched(); } while (!completion_done(wait)); } /** * blk_execute_rq - insert a request into queue for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. * Return: The blk_status_t result provided to blk_mq_end_request(). */ blk_status_t blk_execute_rq(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = { .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), }; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); rq->end_io_data = &wait; rq->end_io = blk_end_sync_rq; blk_account_io_start(rq); blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, false); if (blk_rq_is_poll(rq)) blk_rq_poll_completion(rq, &wait.done); else blk_wait_io(&wait.done); return wait.ret; } EXPORT_SYMBOL(blk_execute_rq); static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_put_driver_tag(rq); trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; } } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { struct request_queue *q = rq->q; unsigned long flags; __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); spin_lock_irqsave(&q->requeue_lock, flags); list_add_tail(&rq->queuelist, &q->requeue_list); spin_unlock_irqrestore(&q->requeue_lock, flags); if (kick_requeue_list) blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); LIST_HEAD(flush_list); struct request *rq; spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock); while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); /* * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request. */ if (rq->rq_flags & RQF_DONTPREP) blk_mq_request_bypass_insert(rq, 0); else blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); } while (!list_empty(&flush_list)) { rq = list_entry(flush_list.next, struct request, queuelist); list_del_init(&rq->queuelist); blk_mq_insert_request(rq, 0); } blk_mq_run_hw_queues(q, false); } void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); static bool blk_is_flush_data_rq(struct request *rq) { return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq); } static bool blk_mq_rq_inflight(struct request *rq, void *priv) { /* * If we find a request that isn't idle we know the queue is busy * as it's checked in the iter. * Return false to stop the iteration. * * In case of queue quiesce, if one flush data request is completed, * don't count it as inflight given the flush sequence is suspended, * and the original flush data request is invisible to driver, just * like other pending requests because of quiesce */ if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) && blk_is_flush_data_rq(rq) && blk_mq_request_completed(rq))) { bool *busy = priv; *busy = true; return false; } return true; } bool blk_mq_queue_inflight(struct request_queue *q) { bool busy = false; blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); return busy; } EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); static void blk_mq_rq_timed_out(struct request *req) { req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; ret = req->q->mq_ops->timeout(req); if (ret == BLK_EH_DONE) return; WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } blk_add_timer(req); } struct blk_expired_data { bool has_timedout_rq; unsigned long next; unsigned long timeout_start; }; static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) return false; if (rq->rq_flags & RQF_TIMED_OUT) return false; deadline = READ_ONCE(rq->deadline); if (time_after_eq(expired->timeout_start, deadline)) return true; if (expired->next == 0) expired->next = deadline; else if (time_after(expired->next, deadline)) expired->next = deadline; return false; } void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) { if (rq->end_io(rq, 0) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); } } static bool blk_mq_check_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; /* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot * be reallocated underneath the timeout handler's processing, then * the expire check is reliable. If the request is not expired, then * it was completed and reallocated as a new request after returning * from blk_mq_check_expired(). */ if (blk_mq_req_expired(rq, expired)) { expired->has_timedout_rq = true; return false; } return true; } static bool blk_mq_handle_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; if (blk_mq_req_expired(rq, expired)) blk_mq_rq_timed_out(rq); return true; } static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); struct blk_expired_data expired = { .timeout_start = jiffies, }; struct blk_mq_hw_ctx *hctx; unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting * completion, since the timeout code would not be able to * acquire the queue reference here. * * That's why we don't use blk_queue_enter here; instead, we use * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return; /* check if there is any timed-out request */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); if (expired.has_timedout_rq) { /* * Before walking tags, we must ensure any submit started * before the current time has finished. Since the submit * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished */ blk_mq_wait_quiesce_done(q->tag_set); expired.next = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); } if (expired.next != 0) { mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer. If * we end up here it means that no requests are pending and * also that no request has been pending for a while. Mark * each hctx as idle. */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); } } blk_queue_exit(q); } struct flush_busy_ctx_data { struct blk_mq_hw_ctx *hctx; struct list_head *list; }; static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; } /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct flush_busy_ctx_data data = { .hctx = hctx, .list = list, }; sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; struct request *rq; }; static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); return !dispatch_data->rq; } struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, }; __sbitmap_for_each_set(&hctx->ctx_map, off, dispatch_rq_from_ctx, &data); return data.rq; } bool __blk_mq_alloc_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) return false; } tag = __sbitmap_queue_get(bt); if (tag == BLK_MQ_NO_TAG) return false; rq->tag = tag + tag_offset; blk_mq_inc_active_requests(rq->mq_hctx); return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { struct sbitmap_queue *sbq; list_del_init(&wait->entry); sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); blk_mq_run_hw_queue(hctx, true); return 1; } /* * Mark us waiting for a tag. For shared tags, this involves hooking us into * the tag wakeups. For non-shared tags, we can simply mark us needing a * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct sbitmap_queue *sbq; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && !(blk_mq_is_shared_tags(hctx->flags))) { blk_mq_sched_mark_restart_hctx(hctx); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. * * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. */ return blk_mq_get_driver_tag(rq); } wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) sbq = &hctx->tags->breserved_tags; else sbq = &hctx->tags->bitmap_tags; wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } atomic_inc(&sbq->ws_active); wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); /* * Add one explicit barrier since blk_mq_get_driver_tag() may * not imply barrier in case of failure. * * Order adding us to wait queue and allocating driver tag. * * The pair is the one implied in sbitmap_queue_wake_up() which * orders clearing sbitmap tag bits and waitqueue_active() in * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless * * Otherwise, re-order of adding wait queue and getting driver tag * may cause __sbitmap_queue_wake_up() to wake up nothing because * the waitqueue_active() may not observe us in wait queue. */ smp_mb(); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq); if (!ret) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } /* * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. */ list_del_init(&wait->entry); atomic_dec(&sbq->ws_active); spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return true; } #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 /* * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): * - EWMA is one simple way to compute running average value * - weight(7/8 and 1/8) is applied so that it can decrease exponentially * - take 4 as factor for avoiding to get too small(0) result, and this * factor doesn't matter because EWMA decreases exponentially */ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; ewma = hctx->dispatch_busy; if (!ewma && !busy) return; ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; if (busy) ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; hctx->dispatch_busy = ewma; } #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ static void blk_mq_handle_dev_resource(struct request *rq, struct list_head *list) { list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); } enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, PREP_DISPATCH_NO_BUDGET, }; static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, bool need_budget) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; int budget_token = -1; if (need_budget) { budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) { blk_mq_put_driver_tag(rq); return PREP_DISPATCH_NO_BUDGET; } blk_mq_set_rq_budget_token(rq, budget_token); } if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The * waitqueue takes care of that. If the queue is run * before we add this entry back on the dispatch list, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(hctx, rq)) { /* * All budgets not got from this function will be put * together during handling partial dispatch */ if (need_budget) blk_mq_put_dispatch_budget(rq->q, budget_token); return PREP_DISPATCH_NO_TAG; } } return PREP_DISPATCH_OK; } /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ static void blk_mq_release_budgets(struct request_queue *q, struct list_head *list) { struct request *rq; list_for_each_entry(rq, list, queuelist) { int budget_token = blk_mq_get_rq_budget_token(rq); if (budget_token >= 0) blk_mq_put_dispatch_budget(q, budget_token); } } /* * blk_mq_commit_rqs will notify driver using bd->last that there is no * more requests. (See comment in struct blk_mq_ops for commit_rqs for * details) * Attention, we should explicitly call this in unusual cases: * 1) did not queue everything initially scheduled to queue * 2) the last attempt to queue a request failed */ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, bool from_schedule) { if (hctx->queue->mq_ops->commit_rqs && queued) { trace_block_unplug(hctx->queue, queued, !from_schedule); hctx->queue->mq_ops->commit_rqs(hctx); } } /* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool get_budget) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; bool needs_resource = false; if (list_empty(list)) return false; /* * Now process all the entries, sending them to the driver. */ queued = 0; do { struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); prep = blk_mq_prep_dispatch_rq(rq, get_budget); if (prep != PREP_DISPATCH_OK) break; list_del_init(&rq->queuelist); bd.rq = rq; bd.last = list_empty(list); ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: needs_resource = true; fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ if (!list_empty(list) || ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ if (!list_empty(list)) { bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); /* * If the caller allocated budgets, free the budgets of the * requests that have not yet been passed to the block driver. */ if (!get_budget) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* * Order adding requests to hctx->dispatch and checking * SCHED_RESTART flag. The pair of this smp_mb() is the one * in blk_mq_sched_restart(). Avoid restart code path to * miss the new added requests to hctx->dispatch, meantime * SCHED_RESTART is observed here. */ smp_mb(); /* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * * If 'no_tag' is set, that means that we failed getting * a driver tag with an I/O scheduler attached. If our dispatch * waitqueue is no longer active, ensure that we run the queue * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests * were pushed back onto the dispatch list. Rerun the queue to * avoid starvation. Notes: * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do * similar if we couldn't get budget or couldn't lock a zone * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); if (prep == PREP_DISPATCH_NO_BUDGET) needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); return false; } blk_mq_update_dispatch_busy(hctx, false); return true; } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) { int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(hctx->cpumask); return cpu; } /* * ->next_cpu is always calculated from hctx->cpumask, so simply use * it for speeding up the check */ static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) { return hctx->next_cpu >= nr_cpu_ids; } /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. * For now we just round-robin here, switching for every * BLK_MQ_CPU_WORK_BATCH queued items. */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { bool tried = false; int next_cpu = hctx->next_cpu; /* Switch to unbound if no allowable CPUs in this hctx */ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { select_cpu: next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } /* * Do unbound schedule if we can't find a online CPU for this hctx, * and it should only happen in the path of handling CPU DEAD. */ if (!cpu_online(next_cpu)) { if (!tried) { tried = true; goto select_cpu; } /* * Make sure to re-select CPU next time once after CPUs * in hctx->cpumask become online again. */ hctx->next_cpu = next_cpu; hctx->next_cpu_batch = 1; return WORK_CPU_UNBOUND; } hctx->next_cpu = next_cpu; return next_cpu; } /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx))) return; kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) { bool need_run; /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even blk_mq_hctx_has_pending() can't be called safely. * * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ __blk_mq_run_dispatch_ops(hctx->queue, false, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); return need_run; } /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. * * Check if the request queue is not in a quiesced state and if there are * pending requests to be sent. If this is true, run the queue to send requests * to hardware. */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { bool need_run; /* * We can't run the queue inline with interrupts disabled. */ WARN_ON_ONCE(!async && in_interrupt()); might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); need_run = blk_mq_hw_queue_need_run(hctx); if (!need_run) { unsigned long flags; /* * Synchronize with blk_mq_unquiesce_queue(), because we check * if hw queue is quiesced locklessly above, we need the use * ->queue_lock to make sure we see the up-to-date status to * not miss rerunning the hw queue. */ spin_lock_irqsave(&hctx->queue->queue_lock, flags); need_run = blk_mq_hw_queue_need_run(hctx); spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); if (!need_run) return; } if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } EXPORT_SYMBOL(blk_mq_run_hw_queue); /* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler. */ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); /* * If the IO scheduler does not respect hardware queues when * dispatching, we just don't bother with multiple HW queues and * dispatch from hctx for the current CPU since running multiple queues * just causes lock contention inside the scheduler and pointless cache * bouncing. */ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; if (!blk_mq_hctx_stopped(hctx)) return hctx; return NULL; } /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. * @async: If we want to run the queue asynchronously. */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * If there is already a run_work pending, leave the * pending delay untouched. Otherwise, a hctx can stall * if another hctx is re-delaying the other's work * before the work executes. */ if (delayed_work_pending(&hctx->run_work)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { cancel_delayed_work(&hctx->run_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_stop_hw_queues); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_start_hw_queues); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (!blk_mq_hctx_stopped(hctx)) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); /* * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch * list in the subsequent routine. */ smp_mb__after_atomic(); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async || (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); else list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); } static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { struct request *rq; enum hctx_type type = hctx->type; /* * Try to issue requests directly if the hw queue isn't busy to save an * extra enqueue & dequeue to the sw queue. */ if (!hctx->dispatch_busy && !run_queue_async) { blk_mq_run_dispatch_ops(hctx->queue, blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); if (rq->cmd_flags & REQ_NOWAIT) run_queue_async = true; } spin_lock(&ctx->lock); list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); out: blk_mq_run_hw_queue(hctx, run_queue_async); } static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_rq_is_passthrough(rq)) { /* * Passthrough request have to be added to hctx->dispatch * directly. The device may be in a situation where it can't * handle FS request, and always returns BLK_STS_RESOURCE for * them, which gets them added to hctx->dispatch. * * If a passthrough request is required to unblock the queues, * and it is added to the scheduler queue, there is no chance to * dispatch it given we prioritize requests in hctx->dispatch. */ blk_mq_request_bypass_insert(rq, flags); } else if (req_op(rq) == REQ_OP_FLUSH) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( * hctx->dispatch) directly and there is at most one in-flight * flush request for each hw queue, so it doesn't matter to add * flush request to tail or front of the dispatch queue. * * Secondly in case of NCQ, flush request belongs to non-NCQ * command, and queueing it will fail when there is any * in-flight normal IO request(NCQ command). When adding flush * rq to the front of hctx->dispatch, it is easier to introduce * extra time to flush rq's latency because of S_SCHED_RESTART * compared with adding to the tail of dispatch queue, then * chance of flush merge is increased, and less flush requests * will be issued to controller. It is observed that ~10% time * is saved in blktests block/004 on disk attached to AHCI/NCQ * drive when adding flush rq to the front of hctx->dispatch. * * Simply queue flush rq to the front of hctx->dispatch so that * intensive flush workloads can benefit in case of NCQ HW. */ blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); } else if (q->elevator) { LIST_HEAD(list); WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); list_add(&rq->queuelist, &list); q->elevator->type->ops.insert_requests(hctx, &list, flags); } else { trace_block_rq_insert(rq); spin_lock(&ctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); else list_add_tail(&rq->queuelist, &ctx->rq_lists[hctx->type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { int err; if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; rq->__data_len = bio->bi_iter.bi_size; rq->nr_phys_segments = nr_segs; if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, bio); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); WARN_ON_ONCE(err); blk_account_io_start(rq); } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; blk_status_t ret; /* * For OK queue, we are done. For error, caller may kill it. * Any other error (busy), just add it to our list as we * previously would have done. */ ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; default: blk_mq_update_dispatch_busy(hctx, false); break; } return ret; } static bool blk_mq_get_budget_and_tag(struct request *rq) { int budget_token; budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) return false; blk_mq_set_rq_budget_token(rq, budget_token); if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(rq->q, budget_token); return false; } return true; } /** * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so * we can try send it another time in the future. Requests inserted at this * queue have higher priority. */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_status_t ret; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return; } if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); return; } ret = __blk_mq_issue_directly(hctx, rq, true); switch (ret) { case BLK_STS_OK: break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); break; default: blk_mq_end_request(rq, ret); break; } } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return BLK_STS_OK; } if (!blk_mq_get_budget_and_tag(rq)) return BLK_STS_RESOURCE; return __blk_mq_issue_directly(hctx, rq, last); } static void blk_mq_issue_direct(struct rq_list *rqs) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(rqs))) { bool last = rq_list_empty(rqs); if (hctx != rq->mq_hctx) { if (hctx) { blk_mq_commit_rqs(hctx, queued, false); queued = 0; } hctx = rq->mq_hctx; } ret = blk_mq_request_issue_directly(rq, last); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) { if (blk_queue_quiesced(q)) return; q->mq_ops->queue_rqs(rqs); } static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, struct rq_list *queue_rqs) { struct request *rq = rq_list_pop(rqs); struct request_queue *this_q = rq->q; struct request **prev = &rqs->head; struct rq_list matched_rqs = {}; struct request *last = NULL; unsigned depth = 1; rq_list_add_tail(&matched_rqs, rq); while ((rq = *prev)) { if (rq->q == this_q) { /* move rq from rqs to matched_rqs */ *prev = rq->rq_next; rq_list_add_tail(&matched_rqs, rq); depth++; } else { /* leave rq in rqs */ prev = &rq->rq_next; last = rq; } } rqs->tail = last; *queue_rqs = matched_rqs; return depth; } static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) { struct request_queue *q = rq_list_peek(rqs)->q; trace_block_unplug(q, depth, true); /* * Peek first request and see if we have a ->queue_rqs() hook. * If we do, we can dispatch the whole list in one go. * We already know at this point that all requests belong to the * same queue, caller must ensure that's the case. */ if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); if (rq_list_empty(rqs)) return; } blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); } static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); do { struct request *rq = rq_list_pop(rqs); if (!this_hctx) { this_hctx = rq->mq_hctx; this_ctx = rq->mq_ctx; is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { rq_list_add_tail(&requeue_list, rq); continue; } list_add_tail(&rq->queuelist, &list); depth++; } while (!rq_list_empty(rqs)); *rqs = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); /* passthrough requests should never be issued to the I/O scheduler */ if (is_passthrough) { spin_lock(&this_hctx->lock); list_splice_tail_init(&list, &this_hctx->dispatch); spin_unlock(&this_hctx->lock); blk_mq_run_hw_queue(this_hctx, from_sched); } else if (this_hctx->queue->elevator) { this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, &list, 0); blk_mq_run_hw_queue(this_hctx, from_sched); } else { blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); } percpu_ref_put(&this_hctx->queue->q_usage_counter); } static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) { do { struct rq_list queue_rqs; unsigned depth; depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); blk_mq_dispatch_queue_requests(&queue_rqs, depth); while (!rq_list_empty(&queue_rqs)) blk_mq_dispatch_list(&queue_rqs, false); } while (!rq_list_empty(rqs)); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { unsigned int depth; /* * We may have been called recursively midway through handling * plug->mq_list via a schedule() in the driver's queue_rq() callback. * To avoid mq_list changing under our feet, clear rq_count early and * bail out specifically if rq_count is 0 rather than checking * whether the mq_list is empty. */ if (plug->rq_count == 0) return; depth = plug->rq_count; plug->rq_count = 0; if (!plug->has_elevator && !from_schedule) { if (plug->multiple_queues) { blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); return; } blk_mq_dispatch_queue_requests(&plug->mq_list, depth); if (rq_list_empty(&plug->mq_list)) return; } do { blk_mq_dispatch_list(&plug->mq_list, from_schedule); } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; blk_status_t ret = BLK_STS_OK; while (!list_empty(list)) { struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); if (list_empty(list)) blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { if (blk_attempt_plug_merge(q, bio, nr_segs)) return true; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return true; } return false; } static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, struct bio *bio) { struct blk_mq_alloc_data data = { .q = q, .flags = 0, .shallow_depth = 0, .cmd_flags = bio->bi_opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; struct request *rq; rq_qos_throttle(q, bio); if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) rq_qos_cleanup(q, bio); return rq; } /* * Check if there is a suitable cached request and return it. */ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, struct request_queue *q, blk_opf_t opf) { enum hctx_type type = blk_mq_get_hctx_type(opf); struct request *rq; if (!plug) return NULL; rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; return rq; } static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { if (rq_list_pop(&plug->cached_rqs) != rq) WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, blk_time_get_ns()); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); } static bool bio_unaligned(const struct bio *bio, struct request_queue *q) { unsigned int bs_mask = queue_logical_block_size(q) - 1; /* .bi_sector of any zero sized bio need to be initialized */ if ((bio->bi_iter.bi_size & bs_mask) || ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) return true; return false; } /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. * * Builds up a request structure from @q and @bio and send to the device. The * request may not be queued directly to hardware if: * * This request can be merged with another one * * We want to place request at plug queue for possible future merging * * There is an IO scheduler active at this queue * * It will not queue the request if there is an error with the bio, or at the * request creation. */ void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; unsigned int nr_segs; struct request *rq; blk_status_t ret; /* * If the plug has a cached request for this queue, try to use it. */ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); /* * A BIO that was released from a zone write plug has already been * through the preparation in this function, already holds a reference * on the queue usage counter, and is the only write BIO in-flight for * the target zone. Go straight to preparing a request for it. */ if (bio_zone_write_plugging(bio)) { nr_segs = bio->__bi_nr_segments; if (rq) blk_queue_exit(q); goto new_request; } /* * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. */ if (!rq) { if (unlikely(bio_queue_enter(bio))) return; } /* * Device reconfiguration may change logical block size or reduce the * number of poll queues, so the checks for alignment and poll support * have to be done with queue usage counter held. */ if (unlikely(bio_unaligned(bio, q))) { bio_io_error(bio); goto queue_exit; } if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); goto queue_exit; } bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto queue_exit; if (!bio_integrity_prep(bio)) goto queue_exit; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; if (bio_needs_zone_write_plugging(bio)) { if (blk_zone_plug_bio(bio, nr_segs)) goto queue_exit; } new_request: if (rq) { blk_mq_use_cached_rq(rq, plug, bio); } else { rq = blk_mq_get_new_requests(q, plug, bio); if (unlikely(!rq)) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); goto queue_exit; } } trace_block_getrq(bio); rq_qos_track(q, rq, bio); blk_mq_bio_to_request(rq, bio, nr_segs); ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); return; } if (bio_zone_write_plugging(bio)) blk_zone_write_plug_init_request(rq); if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; if (plug) { blk_add_rq_to_plug(plug, rq); return; } hctx = rq->mq_hctx; if ((rq->rq_flags & RQF_USE_SCHED) || (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, true); } else { blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); } return; queue_exit: /* * Don't drop the queue reference if we were trying to use a cached * request and thus didn't acquire one. */ if (!rq) blk_queue_exit(q); } #ifdef CONFIG_BLK_MQ_STACKING /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @rq: the request being queued */ blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* * SCSI device does not have a good way to return if * Write Same/Zero is actually supported. If a device rejects * a non-read/write command (discard, write same,etc.) the * low-level device driver will set the relevant queue limit to * 0 to prevent blk-lib from issuing more of the offending * operations. Commands queued prior to the queue limit being * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O * errors being propagated to upper layers. */ if (max_sectors == 0) return BLK_STS_NOTSUPP; printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", __func__, blk_rq_sectors(rq), max_sectors); return BLK_STS_IOERR; } /* * The queue settings related to segment counting may differ from the * original queue. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > max_segments) { printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n", __func__, rq->nr_phys_segments, max_segments); return BLK_STS_IOERR; } if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) return ret; blk_account_io_start(rq); /* * Since we have a scheduler attached on the top device, * bypass a potential scheduler on the bottom device for * insert. */ blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) { bio_put(bio); goto free_and_out; } if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else { rq->bio = rq->biotail = bio; } } /* Copy attributes of the original request to the clone request. */ rq->__sector = blk_rq_pos(rq_src); rq->__data_len = blk_rq_bytes(rq_src); if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { rq->rq_flags |= RQF_SPECIAL_PAYLOAD; rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->nr_integrity_segments = rq_src->nr_integrity_segments; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; return 0; free_and_out: blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); #endif /* CONFIG_BLK_MQ_STACKING */ /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. */ void blk_steal_bios(struct bio_list *list, struct request *rq) { if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; else list->head = rq->bio; list->tail = rq->biotail; rq->bio = NULL; rq->biotail = NULL; } rq->__data_len = 0; } EXPORT_SYMBOL_GPL(blk_steal_bios); static size_t order_to_size(unsigned int order) { return (size_t)PAGE_SIZE << order; } /* called before freeing request pool in @tags */ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; unsigned long flags; /* * There is no need to clear mapping if driver tags is not initialized * or the mapping belongs to the driver tags. */ if (!drv_tags || drv_tags == tags) return; list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private); int i; for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } } /* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ spin_lock_irqsave(&drv_tags->lock, flags); spin_unlock_irqrestore(&drv_tags->lock, flags); } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; struct page *page; if (list_empty(&tags->page_list)) return; if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else drv_tags = set->tags[hctx_idx]; if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { struct request *rq = tags->static_rqs[i]; if (!rq) continue; set->ops->exit_request(set, rq, hctx_idx); tags->static_rqs[i] = NULL; } } blk_mq_clear_rq_mapping(drv_tags, tags); while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); /* * Remove kmemleak object previously allocated in * blk_mq_alloc_rqs(). */ kmemleak_free(page_address(page)); __free_pages(page, page->private); } } void blk_mq_free_rq_map(struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; blk_mq_free_tags(tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, unsigned int hctx_idx) { int i; for (i = 0; i < set->nr_maps; i++) { unsigned int start = set->map[i].queue_offset; unsigned int end = start + set->map[i].nr_queues; if (hctx_idx >= start && hctx_idx < end) break; } if (i >= set->nr_maps) i = HCTX_TYPE_DEFAULT; return i; } static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, unsigned int hctx_idx) { enum hctx_type type = hctx_idx_to_type(set, hctx_idx); return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); } static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; if (node == NUMA_NO_NODE) node = set->numa_node; tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); if (!tags) return NULL; tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->static_rqs) goto err_free_rqs; return tags; err_free_rqs: kfree(tags->rqs); err_free_tags: blk_mq_free_tags(tags); return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) { int ret; if (set->ops->init_request) { ret = set->ops->init_request(set, rq, hctx_idx, node); if (ret) return ret; } WRITE_ONCE(rq->state, MQ_RQ_IDLE); return 0; } static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; if (node == NUMA_NO_NODE) node = set->numa_node; INIT_LIST_HEAD(&tags->page_list); /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); left = rq_size * depth; for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; void *p; while (this_order && left < order_to_size(this_order - 1)) this_order--; do { page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) break; if (!this_order--) break; if (order_to_size(this_order) < rq_size) break; } while (1); if (!page) goto fail; page->private = this_order; list_add_tail(&page->lru, &tags->page_list); p = page_address(page); /* * Allow kmemleak to scan these pages as they contain pointers * to additional allocations like via ops->init_request(). */ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { struct request *rq = p; tags->static_rqs[i] = rq; if (blk_mq_init_request(set, rq, hctx_idx, node)) { tags->static_rqs[i] = NULL; goto fail; } p += rq_size; i++; } } return 0; fail: blk_mq_free_rqs(set, tags, hctx_idx); return -ENOMEM; } struct rq_iter_data { struct blk_mq_hw_ctx *hctx; bool has_rq; }; static bool blk_mq_has_request(struct request *rq, void *data) { struct rq_iter_data *iter_data = data; if (rq->mq_hctx != iter_data->hctx) return true; iter_data->has_rq = true; return false; } static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags; struct rq_iter_data data = { .hctx = hctx, }; blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); return data.has_rq; } static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, unsigned int this_cpu) { enum hctx_type type = hctx->type; int cpu; /* * hctx->cpumask has to rule out isolated CPUs, but userspace still * might submit IOs on these isolated CPUs, so use the queue map to * check if all CPUs mapped to this hctx are offline */ for_each_online_cpu(cpu) { struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, type, cpu); if (h != hctx) continue; /* this hctx has at least one online CPU */ if (this_cpu != cpu) return true; } return false; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; /* * Prevent new request from being allocated on the current hctx. * * The smp_mb__after_atomic() Pairs with the implied barrier in * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is * seen once we return from the tag allocator. */ set_bit(BLK_MQ_S_INACTIVE, &hctx->state); smp_mb__after_atomic(); /* * Try to grab a reference to the queue and wait for any outstanding * requests. If we could not grab a reference the queue has been * frozen and there are no requests. */ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { while (blk_mq_hctx_has_requests(hctx)) msleep(5); percpu_ref_put(&hctx->queue->q_usage_counter); } return 0; } /* * Check if one CPU is mapped to the specified hctx * * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed * to be used for scheduling kworker only. For other usage, please call this * helper for checking if one CPU belongs to the specified hctx */ static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu, const struct blk_mq_hw_ctx *hctx) { struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue, hctx->type, cpu); return mapped_hctx == hctx; } static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (blk_mq_cpu_mapped_to_hctx(cpu, hctx)) clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); return 0; } /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it * gets run. */ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx)) return 0; ctx = __blk_mq_get_ctx(hctx->queue, cpu); type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); if (list_empty(&tmp)) return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); return 0; } static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && !hlist_unhashed(&hctx->cpuhp_online)) { cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); INIT_HLIST_NODE(&hctx->cpuhp_online); } if (!hlist_unhashed(&hctx->cpuhp_dead)) { cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_dead); } } static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && hlist_unhashed(&hctx->cpuhp_online)) cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); if (hlist_unhashed(&hctx->cpuhp_dead)) cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); } static void __blk_mq_remove_cpuhp_list(struct list_head *head) { struct blk_mq_hw_ctx *hctx; lockdep_assert_held(&blk_mq_cpuhp_lock); list_for_each_entry(hctx, head, hctx_list) __blk_mq_remove_cpuhp(hctx); } /* * Unregister cpuhp callbacks from exited hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) { LIST_HEAD(hctx_list); spin_lock(&q->unused_hctx_lock); list_splice_init(&q->unused_hctx_list, &hctx_list); spin_unlock(&q->unused_hctx_lock); mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp_list(&hctx_list); mutex_unlock(&blk_mq_cpuhp_lock); spin_lock(&q->unused_hctx_lock); list_splice(&hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } /* * Register cpuhp callbacks from all hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; mutex_lock(&blk_mq_cpuhp_lock); queue_for_each_hw_ctx(q, hctx, i) __blk_mq_add_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } /* * Before freeing hw queue, clearing the flush request reference in * tags->rqs[] for avoiding potential UAF. */ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; unsigned long flags; /* The hw queue may not be mapped yet */ if (!tags) return; WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); /* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ spin_lock_irqsave(&tags->lock, flags); spin_unlock_irqrestore(&tags->lock, flags); } /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct request *flush_rq = hctx->fq->flush_rq; if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); if (blk_queue_init_done(q)) blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], set->queue_depth, flush_rq); if (set->ops->exit_request) set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); xa_erase(&q->hctx_table, hctx_idx); spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; blk_mq_remove_cpuhp(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { hctx->queue_num = hctx_idx; hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto fail; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) goto exit_flush_rq; return 0; exit_flush_rq: if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); fail: return -1; } static struct blk_mq_hw_ctx * blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, int node) { struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) goto free_hctx; atomic_set(&hctx->nr_active, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); INIT_HLIST_NODE(&hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_online); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; INIT_LIST_HEAD(&hctx->hctx_list); /* * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), gfp, node); if (!hctx->ctxs) goto free_cpumask; if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), gfp, node, false, false)) goto free_ctxs; hctx->nr_ctx = 0; spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) goto free_bitmap; blk_mq_hctx_kobj_init(hctx); return hctx; free_bitmap: sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); free_cpumask: free_cpumask_var(hctx->cpumask); free_hctx: kfree(hctx); fail_alloc_hctx: return NULL; } static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { struct blk_mq_tag_set *set = q->tag_set; unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) INIT_LIST_HEAD(&__ctx->rq_lists[k]); __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ for (j = 0; j < set->nr_maps; j++) { hctx = blk_mq_map_queue_type(q, j, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = cpu_to_node(i); } } } struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { struct blk_mq_tags *tags; int ret; tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); if (!tags) return NULL; ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { blk_mq_free_rq_map(tags); return NULL; } return tags; } static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, int hctx_idx) { if (blk_mq_is_shared_tags(set->flags)) { set->tags[hctx_idx] = set->shared_tags; return true; } set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, set->queue_depth); return set->tags[hctx_idx]; } void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); blk_mq_free_rq_map(tags); } } static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx) { if (!blk_mq_is_shared_tags(set->flags)) blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); set->tags[hctx_idx] = NULL; } static void blk_mq_map_swqueue(struct request_queue *q) { unsigned int j, hctx_idx; unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; hctx->dispatch_from = NULL; } /* * Map software to hardware queues. * * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { ctx = per_cpu_ptr(q->queue_ctx, i); for (j = 0; j < set->nr_maps; j++) { if (!set->map[j].nr_queues) { ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); continue; } hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ set->map[j].mq_map[i] = 0; } hctx = blk_mq_map_queue_type(q, j, i); ctx->hctxs[j] = hctx; /* * If the CPU is already set in the mask, then we've * mapped this one already. This can happen if * devices share queues across queue maps. */ if (cpumask_test_cpu(i, hctx->cpumask)) continue; cpumask_set_cpu(i, hctx->cpumask); hctx->type = j; ctx->index_hw[hctx->type] = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; /* * If the nr_ctx type overflows, we have exceeded the * amount of sw queues we can support. */ BUG_ON(!hctx->nr_ctx); } for (; j < HCTX_MAX_TYPES; j++) ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); } queue_for_each_hw_ctx(q, hctx, i) { int cpu; /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. */ if (!hctx->nr_ctx) { /* Never unmap queue 0. We need it as a * fallback in case of a new remap fails * allocation */ if (i) __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; } hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Rule out isolated CPUs from hctx->cpumask to avoid * running block kworker on isolated CPUs */ for_each_cpu(cpu, hctx->cpumask) { if (cpu_is_isolated(cpu)) cpumask_clear_cpu(cpu, hctx->cpumask); } /* * Initialize batch roundrobin counts */ hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } /* * Caller needs to ensure that we're either frozen/quiesced, or that * the queue isn't live yet. */ static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; } else { blk_mq_tag_idle(hctx); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } } static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, bool shared) { struct request_queue *q; unsigned int memflags; lockdep_assert_held(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { memflags = blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q, memflags); } } static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, false); } mutex_unlock(&set->tag_list_lock); INIT_LIST_HEAD(&q->tag_set_list); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { mutex_lock(&set->tag_list_lock); /* * Check to see if we're transitioning to shared (from 1 to 2 queues). */ if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, true); } if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } /* All allocations will be freed in release handler of q->mq_kobj */ static int blk_mq_alloc_ctxs(struct request_queue *q) { struct blk_mq_ctxs *ctxs; int cpu; ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); if (!ctxs) return -ENOMEM; ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!ctxs->queue_ctx) goto fail; for_each_possible_cpu(cpu) { struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); ctx->ctxs = ctxs; } q->mq_kobj = &ctxs->kobj; q->queue_ctx = ctxs->queue_ctx; return 0; fail: kfree(ctxs); return -ENOMEM; } /* * It is the actual release handler for mq, but we do it from * request queue's release handler for avoiding use-after-free * and headache because q->mq_kobj shouldn't have been introduced, * but we can't group ctx/kctx kobj without it. */ void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); /* all hctx are in .unused_hctx_list now */ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { list_del_init(&hctx->hctx_list); kobject_put(&hctx->kobj); } xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); } struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { struct queue_limits default_lim = { }; struct request_queue *q; int ret; if (!lim) lim = &default_lim; lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; if (set->nr_maps > HCTX_TYPE_POLL) lim->features |= BLK_FEAT_POLL; q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { blk_put_queue(q); return ERR_PTR(ret); } return q; } EXPORT_SYMBOL(blk_mq_alloc_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). * * Context: can sleep */ void blk_mq_destroy_queue(struct request_queue *q) { WARN_ON_ONCE(!queue_is_mq(q)); WARN_ON_ONCE(blk_queue_registered(q)); might_sleep(); blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); blk_mq_freeze_queue_wait(q); blk_sync_queue(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); } EXPORT_SYMBOL(blk_mq_destroy_queue); struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; q = blk_mq_alloc_queue(set, lim, queuedata); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_mq_destroy_queue(q); blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass) { struct gendisk *disk; if (!blk_get_queue(q)) return NULL; disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); if (!disk) blk_put_queue(q); return disk; } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); /* * Only hctx removed from cpuhp list can be reused */ static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) { return hlist_unhashed(&hctx->cpuhp_online) && hlist_unhashed(&hctx->cpuhp_dead); } static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) { struct blk_mq_hw_ctx *hctx = NULL, *tmp; /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; } } if (hctx) list_del_init(&hctx->hctx_list); spin_unlock(&q->unused_hctx_lock); if (!hctx) hctx = blk_mq_alloc_hctx(q, set, node); if (!hctx) goto fail; if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) goto free_hctx; return hctx; free_hctx: kobject_put(&hctx->kobj); fail: return NULL; } static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i, j; for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); WARN_ON_ONCE(!hctx); } } /* * Increasing nr_hw_queues fails. Free the newly allocated * hctxs and keep the previous q->nr_hw_queues. */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; } else { j = i; q->nr_hw_queues = set->nr_hw_queues; } xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { __blk_mq_realloc_hw_ctxs(set, q); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); /* register cpuhp for new initialized hctxs */ blk_mq_add_hw_queues_cpuhp(q); } int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; /* * ->tag_set has to be setup before initialize hctx, which cpuphp * handler needs it for checking queue mapping */ q->tag_set = set; if (blk_mq_alloc_ctxs(q)) goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); xa_init(&q->hctx_table); blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); q->nr_requests = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_map_swqueue(q); blk_mq_add_queue_tag_set(set, q); return 0; err_hctxs: blk_mq_release(q); err_exit: q->mq_ops = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); /* tags can _not_ be used after returning from blk_mq_exit_queue */ void blk_mq_exit_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ blk_mq_del_queue_tag_set(q); } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; if (blk_mq_is_shared_tags(set->flags)) { set->shared_tags = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, set->queue_depth); if (!set->shared_tags) return -ENOMEM; } for (i = 0; i < set->nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } return 0; out_unwind: while (--i >= 0) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } return -ENOMEM; } /* * Allocate the request maps associated with this tag_set. Note that this * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; depth = set->queue_depth; do { err = __blk_mq_alloc_rq_maps(set); if (!err) break; set->queue_depth >>= 1; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { err = -ENOMEM; break; } } while (set->queue_depth); if (!set->queue_depth || err) { pr_err("blk-mq: failed to allocate request map\n"); return -ENOMEM; } if (depth != set->queue_depth) pr_info("blk-mq: reduced tag depth (%u -> %u)\n", depth, set->queue_depth); return 0; } static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the * number of hardware queues. */ if (set->nr_maps == 1) set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; if (set->ops->map_queues) { int i; /* * transport .map_queues is usually done in the following * way: * * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. */ for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, int new_nr_hw_queues) { struct blk_mq_tags **new_tags; int i; if (set->nr_hw_queues >= new_nr_hw_queues) goto done; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!new_tags) return -ENOMEM; if (set->tags) memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) { while (--i >= set->nr_hw_queues) __blk_mq_free_map_and_rqs(set, i); return -ENOMEM; } cond_resched(); } done: set->nr_hw_queues = new_nr_hw_queues; return 0; } /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; if (!set->ops->queue_rq) return -EINVAL; if (!set->ops->get_budget ^ !set->ops->put_budget) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth to %u\n", BLK_MQ_MAX_DEPTH); set->queue_depth = BLK_MQ_MAX_DEPTH; } if (!set->nr_maps) set->nr_maps = 1; else if (set->nr_maps > HCTX_MAX_TYPES) return -EINVAL; /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 64 tags to prevent * using too much memory. */ if (is_kdump_kernel()) set->queue_depth = min(64U, set->queue_depth); /* * There is no use for more h/w queues than cpus if we just have * a single map */ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; if (set->flags & BLK_MQ_F_BLOCKING) { set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); if (!set->srcu) return -ENOMEM; ret = init_srcu_struct(set->srcu); if (ret) goto out_free_srcu; } init_rwsem(&set->update_nr_hwq_lock); ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) goto out_cleanup_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; set->map[i].nr_queues = set->nr_hw_queues; } blk_mq_update_queue_map(set); ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); set->map[i].mq_map = NULL; } kfree(set->tags); set->tags = NULL; out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); out_free_srcu: if (set->flags & BLK_MQ_F_BLOCKING) kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); /* allocate and initialize a tagset for a simple single-queue device */ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags) { memset(set, 0, sizeof(*set)); set->ops = ops; set->nr_hw_queues = 1; set->nr_maps = 1; set->queue_depth = queue_depth; set->numa_node = NUMA_NO_NODE; set->flags = set_flags; return blk_mq_alloc_tag_set(set); } EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; for (i = 0; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); set->map[j].mq_map = NULL; } kfree(set->tags); set->tags = NULL; if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); } } EXPORT_SYMBOL(blk_mq_free_tag_set); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; int ret; unsigned long i; if (WARN_ON_ONCE(!q->mq_freeze_depth)) return -EINVAL; if (!set) return -EINVAL; if (q->nr_requests == nr) return 0; blk_mq_quiesce_queue(q); ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; /* * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. */ if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } if (!ret) { q->nr_requests = nr; if (blk_mq_is_shared_tags(set->flags)) { if (q->elevator) blk_mq_tag_update_sched_shared_tags(q); else blk_mq_tag_resize_shared_tags(set, nr); } } blk_mq_unquiesce_queue(q); return ret; } /* * Switch back to the elevator type stored in the xarray. */ static void blk_mq_elv_switch_back(struct request_queue *q, struct xarray *elv_tbl, struct xarray *et_tbl) { struct elevator_type *e = xa_load(elv_tbl, q->id); struct elevator_tags *t = xa_load(et_tbl, q->id); /* The elv_update_nr_hw_queues unfreezes the queue. */ elv_update_nr_hw_queues(q, e, t); /* Drop the reference acquired in blk_mq_elv_switch_none. */ if (e) elevator_put(e); } /* * Stores elevator type in xarray and set current elevator to none. It uses * q->id as an index to store the elevator type into the xarray. */ static int blk_mq_elv_switch_none(struct request_queue *q, struct xarray *elv_tbl) { int ret = 0; lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); /* * Accessing q->elevator without holding q->elevator_lock is safe here * because we're called from nr_hw_queue update which is protected by * set->update_nr_hwq_lock in the writer context. So, scheduler update/ * switch code (which acquires the same lock in the reader context) * can't run concurrently. */ if (q->elevator) { ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); if (WARN_ON_ONCE(ret)) return ret; /* * Before we switch elevator to 'none', take a reference to * the elevator module so that while nr_hw_queue update is * running, no one can remove elevator module. We'd put the * reference to elevator module later when we switch back * elevator. */ __elevator_get(q->elevator->type); elevator_set_none(q); } return ret; } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; struct xarray elv_tbl, et_tbl; bool queues_frozen = false; lockdep_assert_held(&set->tag_list_lock); if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1) return; if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; memflags = memalloc_noio_save(); xa_init(&et_tbl); if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) goto out_memalloc_restore; xa_init(&elv_tbl); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); } /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings. */ list_for_each_entry(q, &set->tag_list, tag_set_list) if (blk_mq_elv_switch_none(q, &elv_tbl)) goto switch_back; list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue_nomemsave(q); queues_frozen = true; if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto switch_back; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { __blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); for (; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } blk_mq_map_swqueue(q); } switch_back: /* The blk_mq_elv_switch_back unfreezes queue for us. */ list_for_each_entry(q, &set->tag_list, tag_set_list) { /* switch_back expects queue to be frozen */ if (!queues_frozen) blk_mq_freeze_queue_nomemsave(q); blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); } list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); blk_mq_remove_hw_queues_cpuhp(q); blk_mq_add_hw_queues_cpuhp(q); } xa_destroy(&elv_tbl); xa_destroy(&et_tbl); out_memalloc_restore: memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { down_write(&set->update_nr_hwq_lock); mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); up_write(&set->update_nr_hwq_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags) { long state = get_current_state(); int ret; do { ret = q->mq_ops->poll(hctx, iob); if (ret > 0) { __set_current_state(TASK_RUNNING); return ret; } if (signal_pending_state(state, current)) __set_current_state(TASK_RUNNING); if (task_is_running(current)) return 1; if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); __set_current_state(TASK_RUNNING); return 0; } int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { if (!blk_mq_can_poll(q)) return 0; return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags) { struct request_queue *q = rq->q; int ret; if (!blk_rq_is_poll(rq)) return 0; if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(blk_rq_poll); unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; } EXPORT_SYMBOL(blk_mq_rq_cpu); void blk_mq_cancel_work_sync(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } static int __init blk_mq_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); for_each_possible_cpu(i) INIT_CSD(&per_cpu(blk_cpu_csd, i), __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, "block/softirq:dead", NULL, blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", blk_mq_hctx_notify_online, blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init);
45 186 179 184 183 166 166 123 36 124 123 114 897 62 75 74 126 126 126 126 126 43 27 2 14 16 16 2 126 126 1 2 3 1 62 124 62 26 166 166 165 66 26 23 23 168 1 167 69 46 125 2 163 66 166 166 163 66 166 165 1 167 164 70 72 241 158 161 42 184 319 51 322 322 312 42 13 320 212 2 179 179 11 179 177 41 188 18 162 59 70 70 15 66 5 190 190 164 191 45 187 1 187 187 353 352 353 353 2 187 177 128 25 6 70 187 213 212 4 211 199 351 351 351 34 322 13 320 313 16 242 2 242 242 53 43 46 1 12 38 23 14 23 23 22 11 3 3 45 7 3 13 27 40 2092 1661 820 45 827 44 44 23 23 23 26 25 1468 1419 178 39 27 835 836 834 1332 1333 1332 1333 1330 1332 1 1227 1227 3310 3313 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 // SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju * Jim Keniston * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> #include <linux/highmem.h> #include <linux/pagemap.h> /* read_mapping_page */ #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> #include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> #include <linux/rcupdate_trace.h> #include <linux/workqueue.h> #include <linux/srcu.h> #include <linux/oom.h> /* check_stable_address_space */ #include <linux/pagewalk.h> #include <linux/uprobes.h> #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE static struct rb_root uprobes_tree = RB_ROOT; /* * allows us to skip the uprobe_mmap if there are no uprobe events active * at this time. Probably a fine grained per inode count is better? */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); /* Covers return_instance's uprobe lifetime. */ DEFINE_STATIC_SRCU(uretprobes_srcu); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ refcount_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ union { struct rcu_head rcu; struct work_struct work; }; loff_t offset; loff_t ref_ctr_offset; unsigned long flags; /* "unsigned long" so bitops work */ /* * The generic code assumes that it has two members of unknown type * owned by the arch-specific code: * * insn - copy_insn() saves the original instruction here for * arch_uprobe_analyze_insn(). * * ixol - potentially modified instruction to execute out of * line, copied to xol_area by xol_get_insn_slot(). */ struct arch_uprobe arch; }; struct delayed_uprobe { struct list_head list; struct uprobe *uprobe; struct mm_struct *mm; }; static DEFINE_MUTEX(delayed_uprobe_lock); static LIST_HEAD(delayed_uprobe_list); /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction * mangled by set_swbp(). * * On a breakpoint hit, thread contests for a slot. It frees the * slot after singlestep. Currently a fixed number of slots are * allocated. */ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ unsigned long *bitmap; /* 0 = free slot */ struct page *page; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ unsigned long vaddr; /* Page(s) of instruction slots */ }; static void uprobe_warn(struct task_struct *t, const char *msg) { pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); } /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have * changed after breakpoint was inserted. * - is_register: indicates if we are in register context. * - Return 1 if the specified virtual address is in an * executable vma. */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC; } static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) { return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT); } static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) { return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); } /** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn * Returns true if @insn is a breakpoint instruction. */ bool __weak is_swbp_insn(uprobe_opcode_t *insn) { return *insn == UPROBE_SWBP_INSN; } /** * is_trap_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_trap_insn * Returns true if @insn is a breakpoint instruction. * * This function is needed for the case where an architecture has multiple * trap instructions (like powerpc). */ bool __weak is_trap_insn(uprobe_opcode_t *insn) { return is_swbp_insn(insn); } static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); kunmap_atomic(kaddr); } static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) { void *kaddr = kmap_atomic(page); memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) { uprobe_opcode_t old_opcode; bool is_swbp; /* * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. * We do not check if it is any other 'trap variant' which could * be conditional trap instruction such as the one powerpc supports. * * The logic is that we do not care if the underlying instruction * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { if (is_swbp) /* register: already installed? */ return 0; } else { if (!is_swbp) /* unregister: was it changed by us? */ return 0; } return 1; } static struct delayed_uprobe * delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; list_for_each_entry(du, &delayed_uprobe_list, list) if (du->uprobe == uprobe && du->mm == mm) return du; return NULL; } static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; if (delayed_uprobe_check(uprobe, mm)) return 0; du = kzalloc(sizeof(*du), GFP_KERNEL); if (!du) return -ENOMEM; du->uprobe = uprobe; du->mm = mm; list_add(&du->list, &delayed_uprobe_list); return 0; } static void delayed_uprobe_delete(struct delayed_uprobe *du) { if (WARN_ON(!du)) return; list_del(&du->list); kfree(du); } static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) { struct list_head *pos, *q; struct delayed_uprobe *du; if (!uprobe && !mm) return; list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (uprobe && du->uprobe != uprobe) continue; if (mm && du->mm != mm) continue; delayed_uprobe_delete(du); } } static bool valid_ref_ctr_vma(struct uprobe *uprobe, struct vm_area_struct *vma) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); return uprobe->ref_ctr_offset && vma->vm_file && file_inode(vma->vm_file) == uprobe->inode && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && vma->vm_start <= vaddr && vma->vm_end > vaddr; } static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; return NULL; } static int __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; int ret; short *ptr; if (!vaddr || !d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, * it may return 0, in that case we have to return error. */ return ret == 0 ? -EBUSY : ret; } kaddr = kmap_atomic(page); ptr = kaddr + (vaddr & ~PAGE_MASK); if (unlikely(*ptr + d < 0)) { pr_warn("ref_ctr going negative. vaddr: 0x%lx, " "curr val: %d, delta: %d\n", vaddr, *ptr, d); ret = -EINVAL; goto out; } *ptr += d; ret = 0; out: kunmap_atomic(kaddr); put_page(page); return ret; } static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); } static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, short d) { struct vm_area_struct *rc_vma; unsigned long rc_vaddr; int ret = 0; rc_vma = find_ref_ctr_vma(uprobe, mm); if (rc_vma) { rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); ret = __update_ref_ctr(mm, rc_vaddr, d); if (ret) update_ref_ctr_warn(uprobe, mm, d); if (d > 0) return ret; } mutex_lock(&delayed_uprobe_lock); if (d > 0) ret = delayed_uprobe_add(uprobe, mm); else delayed_uprobe_remove(uprobe, mm); mutex_unlock(&delayed_uprobe_lock); return ret; } static bool orig_page_is_identical(struct vm_area_struct *vma, unsigned long vaddr, struct page *page, bool *pmd_mappable) { const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, index); struct page *orig_page; bool identical; if (IS_ERR(orig_folio)) return false; orig_page = folio_file_page(orig_folio, index); *pmd_mappable = folio_test_pmd_mappable(orig_folio); identical = folio_test_uptodate(orig_folio) && pages_identical(page, orig_page); folio_put(orig_folio); return identical; } static int __uprobe_write_opcode(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, unsigned long opcode_vaddr, uprobe_opcode_t opcode) { const unsigned long vaddr = opcode_vaddr & PAGE_MASK; const bool is_register = !!is_swbp_insn(&opcode); bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ if (fw->level != FW_LEVEL_PTE) return -EFAULT; /* * See can_follow_write_pte(): we'd actually prefer a writable PTE here, * but the VMA might not be writable. */ if (!pte_write(fw->pte)) { if (!PageAnonExclusive(fw->page)) return -EFAULT; if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) return -EFAULT; /* SOFTDIRTY is handled via pte_mkdirty() below. */ } /* * We'll temporarily unmap the page and flush the TLB, such that we can * modify the page atomically. */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); /* * When unregistering, we may only zap a PTE if uffd is disabled and * there are no unexpected folio references ... */ if (is_register || userfaultfd_missing(vma) || (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)) goto remap; /* * ... and the mapped page is identical to the original page that * would get faulted in on next access. */ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) goto remap; dec_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_remove_rmap_pte(folio, fw->page, vma); if (!folio_mapped(folio) && folio_test_swapcache(folio) && folio_trylock(folio)) { folio_free_swap(folio); folio_unlock(folio); } folio_put(folio); return pmd_mappable; remap: /* * Make sure that our copy_to_page() changes become visible before the * set_pte_at() write. */ smp_wmb(); /* We modified the page. Make sure to mark the PTE dirty. */ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); return 0; } /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. * @vma: the probed virtual memory area. * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode) { const unsigned long vaddr = opcode_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; int ret, is_register, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) return -EINVAL; /* * When registering, we have to break COW to get an exclusive anonymous * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() * cannot deal with PMDs yet. */ if (is_register) gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; retry: ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) goto out; folio = page_folio(page); ret = verify_opcode(page, opcode_vaddr, &opcode); if (ret <= 0) { folio_put(folio); goto out; } /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); goto out; } ref_ctr_updated = 1; } ret = 0; if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) { VM_WARN_ON_ONCE(is_register); folio_put(folio); goto out; } if (!is_register) { /* * In the common case, we'll be able to zap the page when * unregistering. So trigger MMU notifiers now, as we won't * be able to do it under PTL. */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vaddr, vaddr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); } ret = -EAGAIN; /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); folio_walk_end(&fw, vma); } if (!is_register) mmu_notifier_invalidate_range_end(&range); folio_put(folio); switch (ret) { case -EFAULT: gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; fallthrough; case -EAGAIN: goto retry; default: break; } out: /* Revert back reference counter if instruction update failed. */ if (ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. * @vma: the probed virtual memory area. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); } /** * set_orig_insn - Restore the original instruction. * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } /* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } /* * uprobe should have guaranteed lifetime, which can be either of: * - caller already has refcount taken (and wants an extra one); * - uprobe is RCU protected and won't be freed until after grace period; * - we are holding uprobes_treelock (for read or write, doesn't matter). */ static struct uprobe *try_get_uprobe(struct uprobe *uprobe) { if (refcount_inc_not_zero(&uprobe->ref)) return uprobe; return NULL; } static inline bool uprobe_is_active(struct uprobe *uprobe) { return !RB_EMPTY_NODE(&uprobe->rb_node); } static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); kfree(uprobe); } static void uprobe_free_srcu(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); } static void uprobe_free_deferred(struct work_struct *work) { struct uprobe *uprobe = container_of(work, struct uprobe, work); write_lock(&uprobes_treelock); if (uprobe_is_active(uprobe)) { write_seqcount_begin(&uprobes_seqcount); rb_erase(&uprobe->rb_node, &uprobes_tree); write_seqcount_end(&uprobes_seqcount); } write_unlock(&uprobes_treelock); /* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from * delayed_uprobe_list from remove_breakpoint(). Do it here. */ mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(uprobe, NULL); mutex_unlock(&delayed_uprobe_lock); /* start srcu -> rcu_tasks_trace -> kfree chain */ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); } static void put_uprobe(struct uprobe *uprobe) { if (!refcount_dec_and_test(&uprobe->ref)) return; INIT_WORK(&uprobe->work, uprobe_free_deferred); schedule_work(&uprobe->work); } /* Initialize hprobe as SRCU-protected "leased" uprobe */ static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) { WARN_ON(!uprobe); hprobe->state = HPROBE_LEASED; hprobe->uprobe = uprobe; hprobe->srcu_idx = srcu_idx; } /* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) { hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; hprobe->uprobe = uprobe; hprobe->srcu_idx = -1; } /* * hprobe_consume() fetches hprobe's underlying uprobe and detects whether * uprobe is SRCU protected or is refcounted. hprobe_consume() can be * used only once for a given hprobe. * * Caller has to call hprobe_finalize() and pass previous hprobe_state, so * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever * is appropriate. */ static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) { *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); switch (*hstate) { case HPROBE_LEASED: case HPROBE_STABLE: return hprobe->uprobe; case HPROBE_GONE: /* uprobe is NULL, no SRCU */ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ return NULL; default: WARN(1, "hprobe invalid state %d", *hstate); return NULL; } } /* * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. * hprobe_finalize() can only be used from current context after * hprobe_consume() call (which determines uprobe and hstate value). */ static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) { switch (hstate) { case HPROBE_LEASED: __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); break; case HPROBE_STABLE: put_uprobe(hprobe->uprobe); break; case HPROBE_GONE: case HPROBE_CONSUMED: break; default: WARN(1, "hprobe invalid state %d", hstate); break; } } /* * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of * them can win the race to perform SRCU unlocking. Whoever wins must perform * SRCU unlock. * * Returns underlying valid uprobe or NULL, if there was no underlying uprobe * to begin with or we failed to bump its refcount and it's going away. * * Returned non-NULL uprobe can be still safely used within an ongoing SRCU * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has * an extra refcount for caller to assume and use. Otherwise, it's not * guaranteed that returned uprobe has a positive refcount, so caller has to * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current * SRCU lock region. See dup_utask(). */ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) { enum hprobe_state hstate; /* * Caller should guarantee that return_instance is not going to be * freed from under us. This can be achieved either through holding * rcu_read_lock() or by owning return_instance in the first place. * * Underlying uprobe is itself protected from reuse by SRCU, so ensure * SRCU lock is held properly. */ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { case HPROBE_STABLE: /* uprobe has positive refcount, bump refcount, if necessary */ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; case HPROBE_GONE: /* * SRCU was unlocked earlier and we didn't manage to take * uprobe refcnt, so it's effectively NULL */ return NULL; case HPROBE_CONSUMED: /* * uprobe was consumed, so it's effectively NULL as far as * uretprobe processing logic is concerned */ return NULL; case HPROBE_LEASED: { struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); /* * Try to switch hprobe state, guarding against * hprobe_consume() or another hprobe_expire() racing with us. * Note, if we failed to get uprobe refcount, we use special * HPROBE_GONE state to signal that hprobe->uprobe shouldn't * be used as it will be freed after SRCU is unlocked. */ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { /* We won the race, we are the ones to unlock SRCU */ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); return get ? get_uprobe(uprobe) : uprobe; } /* * We lost the race, undo refcount bump (if it ever happened), * unless caller would like an extra refcount anyways. */ if (uprobe && !get) put_uprobe(uprobe); /* * Even if hprobe_consume() or another hprobe_expire() wins * the state update race and unlocks SRCU from under us, we * still have a guarantee that underyling uprobe won't be * freed due to ongoing caller's SRCU lock region, so we can * return it regardless. Also, if `get` was true, we also have * an extra ref for the caller to own. This is used in dup_utask(). */ return uprobe; } default: WARN(1, "unknown hprobe state %d", hstate); return NULL; } } static __always_inline int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, const struct uprobe *r) { if (l_inode < r->inode) return -1; if (l_inode > r->inode) return 1; if (l_offset < r->offset) return -1; if (l_offset > r->offset) return 1; return 0; } #define __node_2_uprobe(node) \ rb_entry((node), struct uprobe, rb_node) struct __uprobe_key { struct inode *inode; loff_t offset; }; static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) { const struct __uprobe_key *a = key; return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); } static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) { struct uprobe *u = __node_2_uprobe(a); return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } /* * Assumes being inside RCU protected region. * No refcount is taken on returned uprobe. */ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; struct rb_node *node; unsigned int seq; lockdep_assert(rcu_read_lock_trace_held()); do { seq = read_seqcount_begin(&uprobes_seqcount); node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); /* * Lockless RB-tree lookups can result only in false negatives. * If the element is found, it is correct and can be returned * under RCU protection. If we find nothing, we need to * validate that seqcount didn't change. If it did, we have to * try again as we might have missed the element (false * negative). If seqcount is unchanged, search truly failed. */ if (node) return __node_2_uprobe(node); } while (read_seqcount_retry(&uprobes_seqcount, seq)); return NULL; } /* * Attempt to insert a new uprobe into uprobes_tree. * * If uprobe already exists (for given inode+offset), we just increment * refcount of previously existing uprobe. * * If not, a provided new instance of uprobe is inserted into the tree (with * assumed initial refcount == 1). * * In any case, we return a uprobe instance that ends up being in uprobes_tree. * Caller has to clean up new uprobe instance, if it ended up not being * inserted into the tree. * * We assume that uprobes_treelock is held for writing. */ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; again: node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); if (node) { struct uprobe *u = __node_2_uprobe(node); if (!try_get_uprobe(u)) { rb_erase(node, &uprobes_tree); RB_CLEAR_NODE(&u->rb_node); goto again; } return u; } return uprobe; } /* * Acquire uprobes_treelock and insert uprobe into uprobes_tree * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; write_lock(&uprobes_treelock); write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock); return u; } static void ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) cur_uprobe->ref_ctr_offset, (unsigned long long) uprobe->ref_ctr_offset); } static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); RB_CLEAR_NODE(&uprobe->rb_node); refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); kfree(uprobe); return ERR_PTR(-EINVAL); } kfree(uprobe); uprobe = cur_uprobe; } return uprobe; } static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { static atomic64_t id; down_write(&uprobe->consumer_rwsem); list_add_rcu(&uc->cons_node, &uprobe->consumers); uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. * Should never be called with consumer that's not part of @uprobe->consumers. */ static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); } static int __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset) { struct page *page; /* * Ensure that the page that has the original instruction is populated * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; } static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping = uprobe->inode->i_mapping; loff_t offs = uprobe->offset; void *insn = &uprobe->arch.insn; int size = sizeof(uprobe->arch.insn); int len, err = -EIO; /* Copy only available bytes, -EIO if nothing was read */ do { if (offs >= i_size_read(uprobe->inode)) break; len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); err = __copy_insn(mapping, filp, insn, len, offs); if (err) break; insn += len; offs += len; size -= len; } while (size); return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, struct mm_struct *mm, unsigned long vaddr) { int ret = 0; if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; /* TODO: move this into _register, until then we abuse this sem. */ down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; ret = copy_insn(uprobe, file); if (ret) goto out; ret = -ENOTSUPP; if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) goto out; smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: up_write(&uprobe->consumer_rwsem); return ret; } static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { return !uc->filter || uc->filter(uc, mm); } static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { ret = consumer_filter(uc, mm); if (ret) break; } up_read(&uprobe->consumer_rwsem); return ret; } static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) { struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; /* * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). */ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; } static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) { struct mm_struct *mm = vma->vm_mm; set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { struct map_info *next; struct mm_struct *mm; unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) { struct map_info *next = info->next; kfree(info); return next; } static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; struct map_info *info; int more = 0; again: i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (prev) prev->next = NULL; } if (!prev) { more++; continue; } if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; prev = prev->next; info->next = curr; curr = info; info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } i_mmap_unlock_read(mapping); if (!more) goto out; prev = curr; while (curr) { mmput(curr->mm); curr = curr->next; } do { info = kmalloc(sizeof(struct map_info), GFP_KERNEL); if (!info) { curr = ERR_PTR(-ENOMEM); goto out; } info->next = prev; prev = info; } while (--more); goto again; out: while (prev) prev = free_map_info(prev); return curr; } static int register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { bool is_register = !!new; struct map_info *info; int err = 0; percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, uprobe->offset, is_register); if (IS_ERR(info)) { err = PTR_ERR(info); goto out; } while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; if (err && is_register) goto free; /* * We take mmap_lock for writing to avoid the race with * find_active_uprobe_rcu() which takes mmap_lock for reading. * Thus this install_breakpoint() can not make * is_trap_at_addr() true right after find_uprobe_rcu() * returns NULL in find_active_uprobe_rcu(). */ mmap_write_lock(mm); if (check_stable_address_space(mm)) goto unlock; vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); } out: percpu_up_write(&dup_mmap_sem); return err; } /** * uprobe_unregister_nosync - unregister an already registered probe. * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; down_write(&uprobe->register_rwsem); consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); up_write(&uprobe->register_rwsem); /* TODO : cant unregister? schedule a worker thread */ if (unlikely(err)) { uprobe_warn(current, "unregister, leaking uprobe"); return; } put_uprobe(uprobe); } EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); void uprobe_unregister_sync(void) { /* * Now that handler_chain() and handle_uretprobe_chain() iterate over * uprobe->consumers list under RCU protection without holding * uprobe->register_rwsem, we need to wait for RCU grace period to * make sure that we can't call into just unregistered * uprobe_consumer's callbacks anymore. If we don't do that, fast and * unlucky enough caller can free consumer's memory and cause * handler_chain() or handle_uretprobe_chain() to do an use-after-free. */ synchronize_rcu_tasks_trace(); synchronize_srcu(&uretprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister_sync); /** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) return ERR_PTR(-EINVAL); uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) return uprobe; down_write(&uprobe->register_rwsem); consumer_add(uprobe, uc); ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); if (ret) { uprobe_unregister_nosync(uprobe, uc); /* * Registration might have partially succeeded, so we can have * this consumer being called right at this time. We need to * sync here. It's ok, it's unlikely slow path. */ uprobe_unregister_sync(); return ERR_PTR(ret); } return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); /** * uprobe_apply - add or remove the breakpoints according to @uc->filter * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints * Return: 0 on success or negative error code. */ int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; int ret = -ENOENT; down_write(&uprobe->register_rwsem); rcu_read_lock_trace(); list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (con == uc) { ret = register_for_each_vma(uprobe, add ? uc : NULL); break; } } rcu_read_unlock_trace(); up_write(&uprobe->register_rwsem); return ret; } static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; if (!valid_vma(vma, false) || file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; if (uprobe->offset < offset || uprobe->offset >= offset + vma->vm_end - vma->vm_start) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } mmap_read_unlock(mm); return err; } static struct rb_node * find_node_in_range(struct inode *inode, loff_t min, loff_t max) { struct rb_node *n = uprobes_tree.rb_node; while (n) { struct uprobe *u = rb_entry(n, struct uprobe, rb_node); if (inode < u->inode) { n = n->rb_left; } else if (inode > u->inode) { n = n->rb_right; } else { if (max < u->offset) n = n->rb_left; else if (min > u->offset) n = n->rb_right; else break; } } return n; } /* * For a given range in vma, build a list of probes that need to be inserted. */ static void build_probe_list(struct inode *inode, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *head) { loff_t min, max; struct rb_node *n, *t; struct uprobe *u; INIT_LIST_HEAD(head); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) { struct list_head *pos, *q; struct delayed_uprobe *du; unsigned long vaddr; int ret = 0, err = 0; mutex_lock(&delayed_uprobe_lock); list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (du->mm != vma->vm_mm || !valid_ref_ctr_vma(du->uprobe, vma)) continue; vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); if (ret) { update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); if (!err) err = ret; } delayed_uprobe_delete(du); } mutex_unlock(&delayed_uprobe_lock); return err; } /* * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. */ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; if (no_uprobe_events()) return 0; if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); if (!inode) return 0; mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); /* * We can race with uprobe_unregister(), this uprobe can be already * removed. But in this case filter_chain() must return false, all * consumers have gone away. */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); return 0; } static bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) { loff_t min, max; struct inode *inode; struct rb_node *n; inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); read_unlock(&uprobes_treelock); return !!n; } /* * Called in context of a munmap of a vma. */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) return; if (vma_has_uprobes(vma, start, end)) set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; vmf->page = area->page; get_page(vmf->page); return 0; } static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { return -EPERM; } static const struct vm_special_mapping xol_mapping = { .name = "[uprobes]", .fault = xol_fault, .mremap = xol_mremap, }; /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { struct vm_area_struct *vma; int ret; if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto fail; } ret = 0; /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: mmap_write_unlock(mm); return ret; } void * __weak arch_uprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; *psize = UPROBE_SWBP_INSN_SIZE; return &insn; } static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; unsigned long insns_size; struct xol_area *area; void *insns; area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long), GFP_KERNEL); if (!area->bitmap) goto free_area; area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); insns = arch_uprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; __free_page(area->page); free_bitmap: kfree(area->bitmap); free_area: kfree(area); out: return NULL; } /* * get_xol_area - Allocate process's xol_area if necessary. * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; if (!mm->uprobes_state.xol_area) __create_xol_area(0); /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } /* * uprobe_clear_state - Free the area allocated for slots. */ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); if (!area) return; put_page(area->page); kfree(area->bitmap); kfree(area); } void uprobe_start_dup_mmap(void) { percpu_down_read(&dup_mmap_sem); } void uprobe_end_dup_mmap(void) { percpu_up_read(&dup_mmap_sem); } void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ set_bit(MMF_RECALC_UPROBES, &newmm->flags); } } static unsigned long xol_get_slot_nr(struct xol_area *area) { unsigned long slot_nr; slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); if (slot_nr < UINSNS_PER_PAGE) { if (!test_and_set_bit(slot_nr, area->bitmap)) return slot_nr; } return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. */ static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { struct xol_area *area = get_xol_area(); unsigned long slot_nr; if (!area) return false; wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return true; } /* * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ static void xol_free_insn_slot(struct uprobe_task *utask) { struct xol_area *area = current->mm->uprobes_state.xol_area; unsigned long offset = utask->xol_vaddr - area->vaddr; unsigned int slot_nr; utask->xol_vaddr = 0; /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; slot_nr = offset / UPROBE_XOL_SLOT_BYTES; clear_bit(slot_nr, area->bitmap); smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { /* Initialize the slot */ copy_to_page(page, vaddr, src, len); /* * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. */ flush_dcache_page(page); } /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint * instruction. * Return the address of the breakpoint instruction. */ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) { return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } unsigned long uprobe_get_trap_addr(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (unlikely(utask && utask->active_uprobe)) return utask->vaddr; return instruction_pointer(regs); } static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { ri->cons_cnt = 0; ri->next = utask->ri_pool; utask->ri_pool = ri; } static struct return_instance *ri_pool_pop(struct uprobe_task *utask) { struct return_instance *ri = utask->ri_pool; if (likely(ri)) utask->ri_pool = ri->next; return ri; } static void ri_free(struct return_instance *ri) { kfree(ri->extra_consumers); kfree_rcu(ri, rcu); } static void free_ret_instance(struct uprobe_task *utask, struct return_instance *ri, bool cleanup_hprobe) { unsigned seq; if (cleanup_hprobe) { enum hprobe_state hstate; (void)hprobe_consume(&ri->hprobe, &hstate); hprobe_finalize(&ri->hprobe, hstate); } /* * At this point return_instance is unlinked from utask's * return_instances list and this has become visible to ri_timer(). * If seqcount now indicates that ri_timer's return instance * processing loop isn't active, we can return ri into the pool of * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. * Admittedly, this is a rather simple use of seqcount, but it nicely * abstracts away all the necessary memory barriers, so we use * a well-supported kernel primitive here. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ ri_pool_push(utask, ri); } else { /* we might be racing with ri_timer(), so play it safe */ ri_free(ri); } } /* * Called with no locks held. * Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; struct return_instance *ri, *ri_next; if (!utask) return; t->utask = NULL; WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) { ri_next = ri->next; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } /* free_ret_instance() above might add to ri_pool, so this loop should come last */ ri = utask->ri_pool; while (ri) { ri_next = ri->next; ri_free(ri); ri = ri_next; } kfree(utask); } #define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ #define for_each_ret_instance_rcu(pos, head) \ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) static void ri_timer(struct timer_list *timer) { struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); struct return_instance *ri; /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ guard(srcu)(&uretprobes_srcu); /* RCU protects return_instance from freeing. */ guard(rcu)(); /* * See free_ret_instance() for notes on seqcount use. * We also employ raw API variants to avoid lockdep false-positive * warning complaining about enabled preemption. The timer can only be * invoked once for a uprobe_task. Therefore there can only be one * writer. The reader does not require an even sequence count to make * progress, so it is OK to remain preemptible on PREEMPT_RT. */ raw_write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); raw_write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) { struct uprobe_task *utask; utask = kzalloc(sizeof(*utask), GFP_KERNEL); if (!utask) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); seqcount_init(&utask->ri_seqcount); return utask; } /* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ static struct uprobe_task *get_utask(void) { if (!current->utask) current->utask = alloc_utask(); return current->utask; } static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; ri = ri_pool_pop(utask); if (ri) return ri; ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { struct return_instance *ri; ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); if (!ri) return NULL; if (unlikely(old->cons_cnt > 1)) { ri->extra_consumers = kmemdup(old->extra_consumers, sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), GFP_KERNEL); if (!ri->extra_consumers) { kfree(ri); return NULL; } } return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; struct uprobe *uprobe; n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; /* protect uprobes from freeing, we'll need try_get_uprobe() them */ guard(srcu)(&uretprobes_srcu); p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { n = dup_return_instance(o); if (!n) return -ENOMEM; /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ uprobe = hprobe_expire(&o->hprobe, true); /* * New utask will have stable properly refcounted uprobe or * NULL. Even if we failed to get refcounted uprobe, we still * need to preserve full set of return_instances for proper * uretprobe handling and nesting in forked task. */ hprobe_init_stable(&n->hprobe, uprobe); n->next = NULL; rcu_assign_pointer(*p, n); p = &n->next; n_utask->depth++; } return 0; } static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) return; if (!__create_xol_area(current->utask->dup_xol_addr) && !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } /* * Called in context of a new clone/fork from copy_process. */ void uprobe_copy_process(struct task_struct *t, unsigned long flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; struct xol_area *area; t->utask = NULL; if (!utask || !utask->return_instances) return; if (mm == t->mm && !(flags & CLONE_VFORK)) return; if (dup_utask(t, utask)) return uprobe_warn(t, "dup ret instances"); /* The task can fork() after dup_xol_work() fails */ area = mm->uprobes_state.xol_area; if (!area) return uprobe_warn(t, "dup xol area"); if (mm == t->mm) return; t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. * * Returns -1 in case the xol_area is not allocated. */ unsigned long uprobe_get_trampoline_vaddr(void) { unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; return trampoline_vaddr; } static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, struct return_instance *ri) { struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; int srcu_idx; if (!get_xol_area()) goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); goto free; } trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. */ uprobe_warn(current, "handle tail call"); goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } /* __srcu_read_lock() because SRCU lock survives switch to user space */ srcu_idx = __srcu_read_lock(&uretprobes_srcu); ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; rcu_assign_pointer(utask->return_instances, ri); mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; free: ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe_task *utask = current->utask; int err; if (!try_get_uprobe(uprobe)) return -EINVAL; if (!xol_get_insn_slot(uprobe, utask)) { err = -ENOMEM; goto err_out; } utask->vaddr = bp_vaddr; err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(utask); goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; err_out: put_uprobe(uprobe); return err; } /* * If we are singlestepping, then ensure this thread is not connected to * non-fatal signals until completion of singlestep. When xol insn itself * triggers the signal, restart the original insn even if the task is * already SIGKILL'ed (since coredump should report the correct ip). This * is even more important if the task has a handler for SIGSEGV/etc, The * _same_ instruction should be repeated again after return from the signal * handler, and SSTEP can never finish in this case. */ bool uprobe_deny_signal(void) { struct task_struct *t = current; struct uprobe_task *utask = t->utask; if (likely(!utask || !utask->active_uprobe)) return false; WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); } } return true; } static void mmf_recalc_uprobes(struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; } clear_bit(MMF_HAS_UPROBES, &mm->flags); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; int result; if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) return -EINVAL; pagefault_disable(); result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) goto out; result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; struct file *vm_file; loff_t offset; unsigned int seq; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return NULL; vma = vma_lookup(mm, bp_vaddr); if (!vma) return NULL; /* * vm_file memory can be reused for another instance of struct file, * but can't be freed from under us, so it's safe to read fields from * it, even if the values are some garbage values; ultimately * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure * that whatever we speculatively found is correct */ vm_file = READ_ONCE(vma->vm_file); if (!vm_file) return NULL; offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); uprobe = find_uprobe_rcu(vm_file->f_inode, offset); if (!uprobe) return NULL; /* now double check that nothing about MM changed */ if (mmap_lock_speculate_retry(mm, seq)) return NULL; return uprobe; } /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; uprobe = find_active_uprobe_speculative(bp_vaddr); if (uprobe) return uprobe; mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); return uprobe; } static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { struct return_consumer *ric; if (unlikely(ri == ZERO_SIZE_PTR)) return ri; if (unlikely(ri->cons_cnt > 0)) { ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); if (!ric) { ri_free(ri); return ZERO_SIZE_PTR; } ri->extra_consumers = ric; } ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; ric->id = id; ric->cookie = cookie; ri->cons_cnt++; return ri; } static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; int idx; for (idx = *iter; idx < ri->cons_cnt; idx++) { ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } return NULL; } static bool ignore_ret_handler(int rc) { return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; } static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; struct uprobe_task *utask = current->utask; utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; __u64 cookie = 0; int rc = 0; if (uc->handler) { rc = uc->handler(uc, regs, &cookie); WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } remove &= rc == UPROBE_HANDLER_REMOVE; has_consumers = true; if (!uc->ret_handler || ignore_ret_handler(rc)) continue; if (!ri) ri = alloc_return_instance(utask); if (session) ri = push_consumer(ri, uc->id, cookie); } utask->auprobe = NULL; if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); if (remove && has_consumers) { down_read(&uprobe->register_rwsem); /* re-check that removal is still required, this time under lock */ if (!filter_chain(uprobe, current->mm)) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); } up_read(&uprobe->register_rwsem); } } static void handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { struct return_consumer *ric; struct uprobe_consumer *uc; int ric_idx = 0; /* all consumers unsubscribed meanwhile */ if (unlikely(!uprobe)) return; rcu_read_lock_trace(); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; if (uc->ret_handler) { ric = return_consumer_find(ri, &ric_idx, uc->id); if (!session || ric) uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL); } } rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) { bool chained; do { chained = ri->chained; ri = ri->next; /* can't be NULL if chained */ } while (chained); return ri; } void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; utask = current->utask; if (!utask) goto sigill; ri = utask->return_instances; if (!ri) goto sigill; do { /* * We should throw out the frames invalidated by longjmp(). * If this chain is valid, then the next one should be alive * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). */ next_chain = find_next_ret_chain(ri); valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { /* pop current instance from the stack of pending return instances, * as it's not pending anymore: we just fixed up original * instruction pointer in regs and are about to call handlers; * this allows fixup_uretprobe_trampoline_entries() to properly fix up * captured stack traces from uretprobe handlers, in which pending * trampoline addresses on the stack are replaced with correct * original return addresses */ ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) handle_uretprobe_chain(ri, uprobe, regs); hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. */ free_ret_instance(utask, ri, false /* !cleanup_hprobe */); ri = ri_next; } while (ri != next_chain); } while (!valid); return; sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) { return false; } bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { return true; } /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); rcu_read_lock_trace(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't * access this memory. The latter is only possible if * another thread plays with our ->mm. In both cases * we can simply restart. If this vma was unmapped we * can pretend this insn was not executed yet and get * the (correct) SIGSEGV after restart. */ instruction_pointer_set(regs, bp_vaddr); } goto out; } /* change it in advance for ->handler() and restart */ instruction_pointer_set(regs, bp_vaddr); /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; /* * Pairs with the smp_wmb() in prepare_uprobe(). * * Guarantees that if we see the UPROBE_COPY_INSN bit set, then * we must also see the stores to &uprobe->arch performed by the * prepare_uprobe() call. */ smp_rmb(); /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; if (arch_uprobe_ignore(&uprobe->arch, regs)) goto out; handler_chain(uprobe, regs); if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (pre_ssout(uprobe, regs, bp_vaddr)) goto out; out: /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ rcu_read_unlock_trace(); } /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. */ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else WARN_ON_ONCE(1); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); if (utask->signal_denied) { set_thread_flag(TIF_SIGPENDING); utask->signal_denied = false; } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); force_sig(SIGILL); } } /* * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and * allows the thread to return from interrupt. After that handle_swbp() * sets utask->active_uprobe. * * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag * and allows the thread to return from interrupt. * * While returning to userspace, thread notices the TIF_UPROBE flag and calls * uprobe_notify_resume(). */ void uprobe_notify_resume(struct pt_regs *regs) { struct uprobe_task *utask; clear_thread_flag(TIF_UPROBE); utask = current->utask; if (utask && utask->active_uprobe) handle_singlestep(utask, regs); else handle_swbp(regs); } /* * uprobe_pre_sstep_notifier gets called from interrupt context as part of * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit. */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { if (!current->mm) return 0; if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) && (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); return 1; } /* * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep. */ int uprobe_post_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (!current->mm || !utask || !utask->active_uprobe) /* task is currently not uprobed */ return 0; utask->state = UTASK_SSTEP_ACK; set_thread_flag(TIF_UPROBE); return 1; } static struct notifier_block uprobe_exception_nb = { .notifier_call = arch_uprobe_exception_notify, .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); BUG_ON(register_die_notifier(&uprobe_exception_nb)); }
6535 6610 6616 6613 6554 4635 4642 10 2 2 1 2 10 16 7 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 #include <linux/bpf.h> #include <linux/vmalloc.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/idr.h> #include <linux/namei.h> #include <linux/user_namespace.h> #include <linux/security.h> static bool bpf_ns_capable(struct user_namespace *ns, int cap) { return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN)); } bool bpf_token_capable(const struct bpf_token *token, int cap) { struct user_namespace *userns; /* BPF token allows ns_capable() level of capabilities */ userns = token ? token->userns : &init_user_ns; if (!bpf_ns_capable(userns, cap)) return false; if (token && security_bpf_token_capable(token, cap) < 0) return false; return true; } void bpf_token_inc(struct bpf_token *token) { atomic64_inc(&token->refcnt); } static void bpf_token_free(struct bpf_token *token) { security_bpf_token_free(token); put_user_ns(token->userns); kfree(token); } static void bpf_token_put_deferred(struct work_struct *work) { struct bpf_token *token = container_of(work, struct bpf_token, work); bpf_token_free(token); } void bpf_token_put(struct bpf_token *token) { if (!token) return; if (!atomic64_dec_and_test(&token->refcnt)) return; INIT_WORK(&token->work, bpf_token_put_deferred); schedule_work(&token->work); } static int bpf_token_release(struct inode *inode, struct file *filp) { struct bpf_token *token = filp->private_data; bpf_token_put(token); return 0; } static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) { struct bpf_token *token = filp->private_data; u64 mask; BUILD_BUG_ON(__MAX_BPF_CMD >= 64); mask = BIT_ULL(__MAX_BPF_CMD) - 1; if ((token->allowed_cmds & mask) == mask) seq_printf(m, "allowed_cmds:\tany\n"); else seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1; if ((token->allowed_maps & mask) == mask) seq_printf(m, "allowed_maps:\tany\n"); else seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1; if ((token->allowed_progs & mask) == mask) seq_printf(m, "allowed_progs:\tany\n"); else seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1; if ((token->allowed_attachs & mask) == mask) seq_printf(m, "allowed_attachs:\tany\n"); else seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs); } #define BPF_TOKEN_INODE_NAME "bpf-token" static const struct inode_operations bpf_token_iops = { }; const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; int bpf_token_create(union bpf_attr *attr) { struct bpf_mount_opts *mnt_opts; struct bpf_token *token = NULL; struct user_namespace *userns; struct inode *inode; struct file *file; CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; struct super_block *sb; umode_t mode; int err, fd; if (fd_empty(f)) return -EBADF; path = fd_file(f)->f_path; sb = path.dentry->d_sb; if (path.dentry != sb->s_root) return -EINVAL; if (sb->s_op != &bpf_super_ops) return -EINVAL; err = path_permission(&path, MAY_ACCESS); if (err) return err; userns = sb->s_user_ns; /* * Enforce that creators of BPF tokens are in the same user * namespace as the BPF FS instance. This makes reasoning about * permissions a lot easier and we can always relax this later. */ if (current_user_ns() != userns) return -EPERM; if (!ns_capable(userns, CAP_BPF)) return -EPERM; /* Creating BPF token in init_user_ns doesn't make much sense. */ if (current_user_ns() == &init_user_ns) return -EOPNOTSUPP; mnt_opts = sb->s_fs_info; if (mnt_opts->delegate_cmds == 0 && mnt_opts->delegate_maps == 0 && mnt_opts->delegate_progs == 0 && mnt_opts->delegate_attachs == 0) return -ENOENT; /* no BPF token delegation is set up */ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); inode = bpf_get_inode(sb, NULL, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &bpf_token_iops; inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); if (IS_ERR(file)) { iput(inode); return PTR_ERR(file); } token = kzalloc(sizeof(*token), GFP_USER); if (!token) { err = -ENOMEM; goto out_file; } atomic64_set(&token->refcnt, 1); /* remember bpffs owning userns for future ns_capable() checks */ token->userns = get_user_ns(userns); token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; token->allowed_attachs = mnt_opts->delegate_attachs; err = security_bpf_token_create(token, attr, &path); if (err) goto out_token; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { err = fd; goto out_token; } file->private_data = token; fd_install(fd, file); return fd; out_token: bpf_token_free(token); out_file: fput(file); return err; } int bpf_token_get_info_by_fd(struct bpf_token *token, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_token_info info; u32 info_len = attr->info.info_len; info_len = min_t(u32, info_len, sizeof(info)); memset(&info, 0, sizeof(info)); info.allowed_cmds = token->allowed_cmds; info.allowed_maps = token->allowed_maps; info.allowed_progs = token->allowed_progs; info.allowed_attachs = token->allowed_attachs; if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } struct bpf_token *bpf_token_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_token *token; if (fd_empty(f)) return ERR_PTR(-EBADF); if (fd_file(f)->f_op != &bpf_token_fops) return ERR_PTR(-EINVAL); token = fd_file(f)->private_data; bpf_token_inc(token); return token; } bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) { if (!token) return false; if (!(token->allowed_cmds & BIT_ULL(cmd))) return false; return security_bpf_token_cmd(token, cmd) == 0; } bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) { if (!token || type >= __MAX_BPF_MAP_TYPE) return false; return token->allowed_maps & BIT_ULL(type); } bool bpf_token_allow_prog_type(const struct bpf_token *token, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type) { if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE) return false; return (token->allowed_progs & BIT_ULL(prog_type)) && (token->allowed_attachs & BIT_ULL(attach_type)); }
220 184 3 8 10 303 20 238 58 304 144 7 1 452 454 23 433 195 35 230 6 198 498 498 10 85 2 495 498 168 332 6 2 486 17 144 6 161 115 115 40 39 316 13 20 286 11 179 137 8 4 5 3 11 7 2 5 8 2 6 4 1 1 1 1 39 39 300 6 17 2 2 102 7 203 99 100 104 103 47 61 384 19 45 45 45 45 46 47 47 47 1 1 1 47 1 47 202 203 154 8 1 161 161 161 48 95 95 70 13 55 11 57 20 6 121 19 79 23 28 408 3 3 401 406 25 371 4 1 393 390 260 3 249 8 13 242 207 1 206 89 4 116 53 26 26 1 1 1 259 221 104 104 5 5 159 153 5 12 149 6 109 110 1 109 88 8 16 15 2 13 95 59 46 13 6 6 4 4 245 245 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/blk-mq.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/fsnotify.h> #include <linux/poll.h> #include <linux/nospec.h> #include <linux/compat.h> #include <linux/io_uring/cmd.h> #include <linux/indirect_call_wrapper.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "opdef.h" #include "kbuf.h" #include "alloc_cache.h" #include "rsrc.h" #include "poll.h" #include "rw.h" static void io_complete_rw(struct kiocb *kiocb, long res); static void io_complete_rw_iopoll(struct kiocb *kiocb, long res); struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; u64 addr; u32 len; rwf_t flags; }; static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask) { /* If FMODE_NOWAIT is set for a file, we're golden */ if (req->flags & REQ_F_SUPPORT_NOWAIT) return true; /* No FMODE_NOWAIT, if we can poll, check the status */ if (io_file_can_poll(req)) { struct poll_table_struct pt = { ._key = mask }; return vfs_poll(req->file, &pt) & mask; } /* No FMODE_NOWAIT support, and file isn't pollable. Tough luck. */ return false; } static int io_iov_compat_buffer_select_prep(struct io_rw *rw) { struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr); struct compat_iovec iov; if (copy_from_user(&iov, uiov, sizeof(iov))) return -EFAULT; rw->len = iov.iov_len; return 0; } static int io_iov_buffer_select_prep(struct io_kiocb *req) { struct iovec __user *uiov; struct iovec iov; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); if (rw->len != 1) return -EINVAL; if (io_is_compat(req->ctx)) return io_iov_compat_buffer_select_prep(rw); uiov = u64_to_user_ptr(rw->addr); if (copy_from_user(&iov, uiov, sizeof(*uiov))) return -EFAULT; rw->len = iov.iov_len; return 0; } static int io_import_vec(int ddir, struct io_kiocb *req, struct io_async_rw *io, const struct iovec __user *uvec, size_t uvec_segs) { int ret, nr_segs; struct iovec *iov; if (io->vec.iovec) { nr_segs = io->vec.nr; iov = io->vec.iovec; } else { nr_segs = 1; iov = &io->fast_iov; } ret = __import_iovec(ddir, uvec, uvec_segs, nr_segs, &iov, &io->iter, io_is_compat(req->ctx)); if (unlikely(ret < 0)) return ret; if (iov) { req->flags |= REQ_F_NEED_CLEANUP; io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs); } return 0; } static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, struct io_async_rw *io, unsigned int issue_flags) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); void __user *buf = u64_to_user_ptr(rw->addr); size_t sqe_len = rw->len; if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) return io_import_vec(ddir, req, io, buf, sqe_len); if (io_do_buffer_select(req)) { buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); if (!buf) return -ENOBUFS; rw->addr = (unsigned long) buf; rw->len = sqe_len; } return import_ubuf(ddir, buf, sqe_len, &io->iter); } static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, struct io_async_rw *io, unsigned int issue_flags) { int ret; ret = __io_import_rw_buffer(rw, req, io, issue_flags); if (unlikely(ret < 0)) return ret; iov_iter_save_state(&io->iter, &io->iter_state); return 0; } static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_rw *rw = req->async_data; if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) return; io_alloc_cache_vec_kasan(&rw->vec); if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP) io_vec_free(&rw->vec); if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } } static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) { /* * Disable quick recycling for anything that's gone through io-wq. * In theory, this should be fine to cleanup. However, some read or * write iter handling touches the iovec AFTER having called into the * handler, eg to reexpand or revert. This means we can have: * * task io-wq * issue * punt to io-wq * issue * blkdev_write_iter() * ->ki_complete() * io_complete_rw() * queue tw complete * run tw * req_rw_cleanup * iov_iter_count() <- look at iov_iter again * * which can lead to a UAF. This is only possible for io-wq offload * as the cleanup can run in parallel. As io-wq is not the fast path, * just leave cleanup to the end. * * This is really a bug in the core code that does this, any issue * path should assume that a successful (or -EIOCBQUEUED) return can * mean that the underlying data can be gone at any time. But that * should be fixed seperately, and then this check could be killed. */ if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { req->flags &= ~REQ_F_NEED_CLEANUP; io_rw_recycle(req, issue_flags); } } static int io_rw_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_async_rw *rw; rw = io_uring_alloc_async_data(&ctx->rw_cache, req); if (!rw) return -ENOMEM; if (rw->vec.iovec) req->flags |= REQ_F_NEED_CLEANUP; rw->bytes_done = 0; return 0; } static inline void io_meta_save_state(struct io_async_rw *io) { io->meta_state.seed = io->meta.seed; iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta); } static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) { if (kiocb->ki_flags & IOCB_HAS_METADATA) { io->meta.seed = io->meta_state.seed; iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta); } } static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, u64 attr_ptr, u64 attr_type_mask) { struct io_uring_attr_pi pi_attr; struct io_async_rw *io; int ret; if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr), sizeof(pi_attr))) return -EFAULT; if (pi_attr.rsvd) return -EINVAL; io = req->async_data; io->meta.flags = pi_attr.flags; io->meta.app_tag = pi_attr.app_tag; io->meta.seed = pi_attr.seed; ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr), pi_attr.len, &io->meta.iter); if (unlikely(ret < 0)) return ret; req->flags |= REQ_F_HAS_METADATA; io_meta_save_state(io); return ret; } static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_async_rw *io; unsigned ioprio; u64 attr_type_mask; int ret; if (io_rw_alloc_async(req)) return -ENOMEM; io = req->async_data; rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); io->buf_group = req->buf_index; ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { ret = ioprio_check_cap(ioprio); if (ret) return ret; rw->kiocb.ki_ioprio = ioprio; } else { rw->kiocb.ki_ioprio = get_current_ioprio(); } rw->kiocb.dio_complete = NULL; rw->kiocb.ki_flags = 0; rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream); if (req->ctx->flags & IORING_SETUP_IOPOLL) rw->kiocb.ki_complete = io_complete_rw_iopoll; else rw->kiocb.ki_complete = io_complete_rw; rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags); attr_type_mask = READ_ONCE(sqe->attr_type_mask); if (attr_type_mask) { u64 attr_ptr; /* only PI attribute is supported currently */ if (attr_type_mask != IORING_RW_ATTR_FLAG_PI) return -EINVAL; attr_ptr = READ_ONCE(sqe->attr_ptr); return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); } return 0; } static int io_rw_do_import(struct io_kiocb *req, int ddir) { if (io_do_buffer_select(req)) return 0; return io_import_rw_buffer(ddir, req, req->async_data, 0); } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { int ret; ret = __io_prep_rw(req, sqe, ddir); if (unlikely(ret)) return ret; return io_rw_do_import(req, ddir); } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return io_prep_rw(req, sqe, ITER_SOURCE); } static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { int ret; ret = io_prep_rw(req, sqe, ddir); if (unlikely(ret)) return ret; if (!(req->flags & REQ_F_BUFFER_SELECT)) return 0; /* * Have to do this validation here, as this is in io_read() rw->len * might have chanaged due to buffer selection */ return io_iov_buffer_select_prep(req); } int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return io_prep_rwv(req, sqe, ITER_DEST); } int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return io_prep_rwv(req, sqe, ITER_SOURCE); } static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_async_rw *io = req->async_data; int ret; if (io->bytes_done) return 0; ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, issue_flags); iov_iter_save_state(&io->iter, &io->iter_state); return ret; } int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return __io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return __io_prep_rw(req, sqe, ITER_SOURCE); } static int io_rw_import_reg_vec(struct io_kiocb *req, struct io_async_rw *io, int ddir, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned uvec_segs = rw->len; int ret; ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec, uvec_segs, issue_flags); if (unlikely(ret)) return ret; iov_iter_save_state(&io->iter, &io->iter_state); req->flags &= ~REQ_F_IMPORT_BUFFER; return 0; } static int io_rw_prep_reg_vec(struct io_kiocb *req) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_async_rw *io = req->async_data; const struct iovec __user *uvec; uvec = u64_to_user_ptr(rw->addr); return io_prep_reg_iovec(req, &io->vec, uvec, rw->len); } int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; ret = __io_prep_rw(req, sqe, ITER_DEST); if (unlikely(ret)) return ret; return io_rw_prep_reg_vec(req); } int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; ret = __io_prep_rw(req, sqe, ITER_SOURCE); if (unlikely(ret)) return ret; return io_rw_prep_reg_vec(req); } /* * Multishot read is prepared just like a normal read/write request, only * difference is that we set the MULTISHOT flag. */ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); int ret; /* must be used with provided buffers */ if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; ret = __io_prep_rw(req, sqe, ITER_DEST); if (unlikely(ret)) return ret; if (rw->addr || rw->len) return -EINVAL; req->flags |= REQ_F_APOLL_MULTISHOT; return 0; } void io_readv_writev_cleanup(struct io_kiocb *req) { lockdep_assert_held(&req->ctx->uring_lock); io_rw_recycle(req, 0); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); if (rw->kiocb.ki_pos != -1) return &rw->kiocb.ki_pos; if (!(req->file->f_mode & FMODE_STREAM)) { req->flags |= REQ_F_CUR_POS; rw->kiocb.ki_pos = req->file->f_pos; return &rw->kiocb.ki_pos; } rw->kiocb.ki_pos = 0; return NULL; } static bool io_rw_should_reissue(struct io_kiocb *req) { #ifdef CONFIG_BLOCK struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); umode_t mode = file_inode(req->file)->i_mode; struct io_async_rw *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && !(ctx->flags & IORING_SETUP_IOPOLL))) return false; /* * If ref is dying, we might be running poll reap from the exit work. * Don't attempt to reissue from that path, just let it fail with * -EAGAIN. */ if (percpu_ref_is_dying(&ctx->refs)) return false; io_meta_restore(io, &rw->kiocb); iov_iter_restore(&io->iter, &io->iter_state); return true; #else return false; #endif } static void io_req_end_write(struct io_kiocb *req) { if (req->flags & REQ_F_ISREG) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); kiocb_end_write(&rw->kiocb); } } /* * Trigger the notifications after having done some IO, and finish the write * accounting, if any. */ static void io_req_io_end(struct io_kiocb *req) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); if (rw->kiocb.ki_flags & IOCB_WRITE) { io_req_end_write(req); fsnotify_modify(req->file); } else { fsnotify_access(req->file); } } static void __io_complete_rw_common(struct io_kiocb *req, long res) { if (res == req->cqe.res) return; if (res == -EAGAIN && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; } else { req_set_fail(req); req->cqe.res = res; } } static inline int io_fixup_rw_res(struct io_kiocb *req, long res) { struct io_async_rw *io = req->async_data; /* add previously done IO, if any */ if (req_has_async_data(req) && io->bytes_done > 0) { if (res < 0) res = io->bytes_done; else res += io->bytes_done; } return res; } void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) { long res = kiocb->dio_complete(rw->kiocb.private); io_req_set_res(req, io_fixup_rw_res(req, res), 0); } io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); io_req_rw_cleanup(req, 0); io_req_task_complete(req, tw); } static void io_complete_rw(struct kiocb *kiocb, long res) { struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); struct io_kiocb *req = cmd_to_io_kiocb(rw); if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { __io_complete_rw_common(req, res); io_req_set_res(req, io_fixup_rw_res(req, res), 0); } req->io_task_work.func = io_req_rw_complete; __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) { struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); struct io_kiocb *req = cmd_to_io_kiocb(rw); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; else req->cqe.res = res; } /* order with io_iopoll_complete() checking ->iopoll_completed */ smp_store_release(&req->iopoll_completed, 1); } static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); /* IO was queued async, completion will happen later */ if (ret == -EIOCBQUEUED) return; /* transform internal restart error codes */ if (unlikely(ret < 0)) { switch (ret) { case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: case -ERESTART_RESTARTBLOCK: /* * We can't just restart the syscall, since previously * submitted sqes may already be in progress. Just fail * this IO with EINTR. */ ret = -EINTR; break; } } if (req->ctx->flags & IORING_SETUP_IOPOLL) io_complete_rw_iopoll(&rw->kiocb, ret); else io_complete_rw(&rw->kiocb, ret); } static int kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned final_ret = io_fixup_rw_res(req, ret); if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { __io_complete_rw_common(req, ret); /* * Safe to call io_end from here as we're inline * from the submission path. */ io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); io_req_rw_cleanup(req, issue_flags); return IOU_COMPLETE; } else { io_rw_done(req, ret); } return IOU_ISSUE_SKIP_COMPLETE; } static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; } /* * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. */ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { struct io_kiocb *req = cmd_to_io_kiocb(rw); struct kiocb *kiocb = &rw->kiocb; struct file *file = kiocb->ki_filp; ssize_t ret = 0; loff_t *ppos; /* * Don't support polled IO through this interface, and we can't * support non-blocking either. For the latter, this just causes * the kiocb to be handled from an async context. */ if (kiocb->ki_flags & IOCB_HIPRI) return -EOPNOTSUPP; if ((kiocb->ki_flags & IOCB_NOWAIT) && !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf) return -EFAULT; ppos = io_kiocb_ppos(kiocb); while (iov_iter_count(iter)) { void __user *addr; size_t len; ssize_t nr; if (iter_is_ubuf(iter)) { addr = iter->ubuf + iter->iov_offset; len = iov_iter_count(iter); } else if (!iov_iter_is_bvec(iter)) { addr = iter_iov_addr(iter); len = iter_iov_len(iter); } else { addr = u64_to_user_ptr(rw->addr); len = rw->len; } if (ddir == READ) nr = file->f_op->read(file, addr, len, ppos); else nr = file->f_op->write(file, addr, len, ppos); if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (!iov_iter_is_bvec(iter)) { iov_iter_advance(iter, nr); } else { rw->addr += nr; rw->len -= nr; if (!rw->len) break; } if (nr != len) break; } return ret; } /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. * This gets called when the page is unlocked, and we generally expect that to * happen when the page IO is completed and the page is now uptodate. This will * queue a task_work based retry of the operation, attempting to copy the data * again. If the latter fails because the page was NOT uptodate, then we will * do a thread based blocking retry of the operation. That's the unexpected * slow path. */ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, int sync, void *arg) { struct wait_page_queue *wpq; struct io_kiocb *req = wait->private; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct wait_page_key *key = arg; wpq = container_of(wait, struct wait_page_queue, wait); if (!wake_page_match(wpq, key)) return 0; rw->kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); io_req_task_queue(req); return 1; } /* * This controls whether a given IO request should be armed for async page * based retry. If we return false here, the request is handed to the async * worker threads for retry. If we're doing buffered reads on a regular file, * we prepare a private wait_page_queue entry and retry the operation. This * will either succeed because the page is now uptodate and unlocked, or it * will register a callback when the page is unlocked at IO completion. Through * that callback, io_uring uses task_work to setup a retry of the operation. * That retry will attempt the buffered read again. The retry will generally * succeed, or in rare cases where it fails, we then fall back to using the * async worker threads for a blocking retry. */ static bool io_rw_should_retry(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; struct wait_page_queue *wait = &io->wpq; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; /* * Never retry for NOWAIT or a request with metadata, we just complete * with -EAGAIN. */ if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA)) return false; /* Only for buffered IO */ if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks */ if (io_file_can_poll(req) || !(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC)) return false; wait->wait.func = io_async_buf_func; wait->wait.private = req; wait->wait.flags = 0; INIT_LIST_HEAD(&wait->wait.entry); kiocb->ki_flags |= IOCB_WAITQ; kiocb->ki_flags &= ~IOCB_NOWAIT; kiocb->ki_waitq = wait; return true; } static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) { struct file *file = rw->kiocb.ki_filp; if (likely(file->f_op->read_iter)) return file->f_op->read_iter(&rw->kiocb, iter); else if (file->f_op->read) return loop_rw_iter(READ, rw, iter); else return -EINVAL; } static bool need_complete_io(struct io_kiocb *req) { return req->flags & REQ_F_ISREG || S_ISBLK(file_inode(req->file)->i_mode); } static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; struct io_ring_ctx *ctx = req->ctx; struct file *file = req->file; int ret; if (unlikely(!(file->f_mode & mode))) return -EBADF; if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file); kiocb->ki_flags = file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; kiocb->ki_flags |= IOCB_ALLOC_CACHE; /* * If the file is marked O_NONBLOCK, still allow retry for it if it * supports async. Otherwise it's impossible to use O_NONBLOCK files * reliably. If not, or it IOCB_NOWAIT is set, don't retry. */ if (kiocb->ki_flags & IOCB_NOWAIT || ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT)))) req->flags |= REQ_F_NOWAIT; if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; req->iopoll_completed = 0; if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { /* make sure every req only blocks once*/ req->flags &= ~REQ_F_IOPOLL_STATE; req->iopoll_start = ktime_get_ns(); } } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; } if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; /* * We have a union of meta fields with wpq used for buffered-io * in io_async_rw, so fail it here. */ if (!(req->file->f_flags & O_DIRECT)) return -EOPNOTSUPP; kiocb->ki_flags |= IOCB_HAS_METADATA; kiocb->private = &io->meta; } return 0; } static int __io_read(struct io_kiocb *req, unsigned int issue_flags) { bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; ssize_t ret; loff_t *ppos; if (req->flags & REQ_F_IMPORT_BUFFER) { ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags); if (unlikely(ret)) return ret; } else if (io_do_buffer_select(req)) { ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; } ret = io_rw_init_file(req, FMODE_READ, READ); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ if (unlikely(!io_file_supports_nowait(req, EPOLLIN))) return -EAGAIN; kiocb->ki_flags |= IOCB_NOWAIT; } else { /* Ensure we clear previously set non-block flag */ kiocb->ki_flags &= ~IOCB_NOWAIT; } ppos = io_kiocb_update_pos(req); ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); if (unlikely(ret)) return ret; ret = io_iter_do_read(rw, &io->iter); /* * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT * issue, even though they should be returning -EAGAIN. To be safe, * retry from blocking context for either. */ if (ret == -EOPNOTSUPP && force_nonblock) ret = -EAGAIN; if (ret == -EAGAIN) { /* If we can poll, just do that. */ if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) goto done; ret = 0; } else if (ret == -EIOCBQUEUED) { return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || (issue_flags & IO_URING_F_MULTISHOT)) { /* read all, failed, already did sync or don't want to retry */ goto done; } /* * Don't depend on the iter state matching what was consumed, or being * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ iov_iter_restore(&io->iter, &io->iter_state); io_meta_restore(io, kiocb); do { /* * We end up here because of a partial read, either from * above or inside this loop. Advance the iter by the bytes * that were consumed. */ iov_iter_advance(&io->iter, ret); if (!iov_iter_count(&io->iter)) break; io->bytes_done += ret; iov_iter_save_state(&io->iter, &io->iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { kiocb->ki_flags &= ~IOCB_WAITQ; return -EAGAIN; } req->cqe.res = iov_iter_count(&io->iter); /* * Now retry read with the IOCB_WAITQ parts set in the iocb. If * we get -EIOCBQUEUED, then we'll get a notification when the * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ ret = io_iter_do_read(rw, &io->iter); if (ret == -EIOCBQUEUED) return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); done: /* it's faster to check here then delegate to kfree */ return ret; } int io_read(struct io_kiocb *req, unsigned int issue_flags) { int ret; ret = __io_read(req, issue_flags); if (ret >= 0) return kiocb_done(req, ret, issue_flags); return ret; } int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned int cflags = 0; int ret; /* * Multishot MUST be used on a pollable file */ if (!io_file_can_poll(req)) return -EBADFD; /* make it sync, multishot doesn't support async execution */ rw->kiocb.ki_complete = NULL; ret = __io_read(req, issue_flags); /* * If we get -EAGAIN, recycle our buffer and just let normal poll * handling arm it. */ if (ret == -EAGAIN) { /* * Reset rw->len to 0 again to avoid clamping future mshot * reads, in case the buffer size varies. */ if (io_kbuf_recycle(req, issue_flags)) rw->len = 0; return IOU_RETRY; } else if (ret <= 0) { io_kbuf_recycle(req, issue_flags); if (ret < 0) req_set_fail(req); } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { cflags = io_put_kbuf(req, ret, issue_flags); } else { /* * Any successful return value will keep the multishot read * armed, if it's still set. Put our buffer and post a CQE. If * we fail to post a CQE, or multishot is no longer set, then * jump to the termination path. This request is then done. */ cflags = io_put_kbuf(req, ret, issue_flags); rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (issue_flags & IO_URING_F_MULTISHOT) /* * Force retry, as we might have more data to * be read and otherwise it won't get retried * until (if ever) another poll is triggered. */ io_poll_multishot_retry(req); return IOU_RETRY; } } /* * Either an error, or we've hit overflow posting the CQE. For any * multishot request, hitting overflow will terminate it. */ io_req_set_res(req, ret, cflags); io_req_rw_cleanup(req, issue_flags); return IOU_COMPLETE; } static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb) { struct inode *inode; bool ret; if (!(req->flags & REQ_F_ISREG)) return true; if (!(kiocb->ki_flags & IOCB_NOWAIT)) { kiocb_start_write(kiocb); return true; } inode = file_inode(kiocb->ki_filp); ret = sb_start_write_trylock(inode->i_sb); if (ret) __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); return ret; } int io_write(struct io_kiocb *req, unsigned int issue_flags) { bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; ssize_t ret, ret2; loff_t *ppos; if (req->flags & REQ_F_IMPORT_BUFFER) { ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags); if (unlikely(ret)) return ret; } ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ if (unlikely(!io_file_supports_nowait(req, EPOLLOUT))) goto ret_eagain; /* Check if we can support NOWAIT. */ if (!(kiocb->ki_flags & IOCB_DIRECT) && !(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) && (req->flags & REQ_F_ISREG)) goto ret_eagain; kiocb->ki_flags |= IOCB_NOWAIT; } else { /* Ensure we clear previously set non-block flag */ kiocb->ki_flags &= ~IOCB_NOWAIT; } ppos = io_kiocb_update_pos(req); ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); if (unlikely(ret)) return ret; if (unlikely(!io_kiocb_start_write(req, kiocb))) return -EAGAIN; kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) ret2 = req->file->f_op->write_iter(kiocb, &io->iter); else if (req->file->f_op->write) ret2 = loop_rw_iter(WRITE, rw, &io->iter); else ret2 = -EINVAL; /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. */ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, req->cqe.res, ret2); /* This is a partial write. The file pos has already been * updated, setup the async struct to complete the request * in the worker. Also update bytes_done to account for * the bytes already written. */ iov_iter_save_state(&io->iter, &io->iter_state); io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; } done: return kiocb_done(req, ret2, issue_flags); } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); io_meta_restore(io, kiocb); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; } } int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags) { int ret; ret = io_init_rw_fixed(req, issue_flags, ITER_DEST); if (unlikely(ret)) return ret; return io_read(req, issue_flags); } int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags) { int ret; ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE); if (unlikely(ret)) return ret; return io_write(req, issue_flags); } void io_rw_fail(struct io_kiocb *req) { int res; res = io_fixup_rw_res(req, req->cqe.res); io_req_set_res(req, res, req->cqe.flags); } static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob, unsigned int poll_flags) { struct file *file = req->file; if (req->opcode == IORING_OP_URING_CMD) { struct io_uring_cmd *ioucmd; ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags); } else { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); return file->f_op->iopoll(&rw->kiocb, iob, poll_flags); } } static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct hrtimer_sleeper timer; enum hrtimer_mode mode; ktime_t kt; u64 sleep_time; if (req->flags & REQ_F_IOPOLL_STATE) return 0; if (ctx->hybrid_poll_time == LLONG_MAX) return 0; /* Using half the running time to do schedule */ sleep_time = ctx->hybrid_poll_time / 2; kt = ktime_set(0, sleep_time); req->flags |= REQ_F_IOPOLL_STATE; mode = HRTIMER_MODE_REL; hrtimer_setup_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode); hrtimer_set_expires(&timer.timer, kt); set_current_state(TASK_INTERRUPTIBLE); hrtimer_sleeper_start_expires(&timer, mode); if (timer.task) io_schedule(); hrtimer_cancel(&timer.timer); __set_current_state(TASK_RUNNING); destroy_hrtimer_on_stack(&timer.timer); return sleep_time; } static int io_uring_hybrid_poll(struct io_kiocb *req, struct io_comp_batch *iob, unsigned int poll_flags) { struct io_ring_ctx *ctx = req->ctx; u64 runtime, sleep_time; int ret; sleep_time = io_hybrid_iopoll_delay(ctx, req); ret = io_uring_classic_poll(req, iob, poll_flags); runtime = ktime_get_ns() - req->iopoll_start - sleep_time; /* * Use minimum sleep time if we're polling devices with different * latencies. We could get more completions from the faster ones. */ if (ctx->hybrid_poll_time > runtime) ctx->hybrid_poll_time = runtime; return ret; } int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { struct io_wq_work_node *pos, *start, *prev; unsigned int poll_flags = 0; DEFINE_IO_COMP_BATCH(iob); int nr_events = 0; /* * Only spin for completions if we don't have multiple devices hanging * off our complete list. */ if (ctx->poll_multi_queue || force_nonspin) poll_flags |= BLK_POLL_ONESHOT; wq_list_for_each(pos, start, &ctx->iopoll_list) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); int ret; /* * Move completed and retryable entries to our local lists. * If we find a request that requires polling, break out * and complete those lists first, if we have entries there. */ if (READ_ONCE(req->iopoll_completed)) break; if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) ret = io_uring_hybrid_poll(req, &iob, poll_flags); else ret = io_uring_classic_poll(req, &iob, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) poll_flags |= BLK_POLL_ONESHOT; /* iopoll may have completed current req */ if (!rq_list_empty(&iob.req_list) || READ_ONCE(req->iopoll_completed)) break; } if (!rq_list_empty(&iob.req_list)) iob.complete(&iob); else if (!pos) return 0; prev = start; wq_list_for_each_resume(pos, prev) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); /* order with io_complete_rw_iopoll(), e.g. ->result updates */ if (!smp_load_acquire(&req->iopoll_completed)) break; nr_events++; req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); if (req->opcode != IORING_OP_URING_CMD) io_req_rw_cleanup(req, 0); } if (unlikely(!nr_events)) return 0; pos = start ? start->next : ctx->iopoll_list.first; wq_list_cut(&ctx->iopoll_list, prev, start); if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs))) return 0; ctx->submit_state.compl_reqs.first = pos; __io_submit_flush_completions(ctx); return nr_events; } void io_rw_cache_free(const void *entry) { struct io_async_rw *rw = (struct io_async_rw *) entry; io_vec_free(&rw->vec); kfree(rw); }
1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtual PTP 1588 clock for use with KVM guests * * Copyright (C) 2017 Red Hat Inc. */ #include <linux/device.h> #include <linux/kernel.h> #include <asm/pvclock.h> #include <asm/kvmclock.h> #include <linux/module.h> #include <uapi/asm/kvm_para.h> #include <uapi/linux/kvm_para.h> #include <linux/ptp_clock_kernel.h> #include <linux/ptp_kvm.h> #include <linux/set_memory.h> static phys_addr_t clock_pair_gpa; static struct kvm_clock_pairing clock_pair_glbl; static struct kvm_clock_pairing *clock_pair; int kvm_arch_ptp_init(void) { struct page *p; long ret; if (!kvm_para_available()) return -EOPNOTSUPP; if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { p = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!p) return -ENOMEM; clock_pair = page_address(p); ret = set_memory_decrypted((unsigned long)clock_pair, 1); if (ret) { __free_page(p); clock_pair = NULL; goto nofree; } } else { clock_pair = &clock_pair_glbl; } clock_pair_gpa = slow_virt_to_phys(clock_pair); if (!pvclock_get_pvti_cpu0_va()) { ret = -EOPNOTSUPP; goto err; } ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, KVM_CLOCK_PAIRING_WALLCLOCK); if (ret == -KVM_ENOSYS) { ret = -EOPNOTSUPP; goto err; } return ret; err: kvm_arch_ptp_exit(); nofree: return ret; } void kvm_arch_ptp_exit(void) { if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { WARN_ON(set_memory_encrypted((unsigned long)clock_pair, 1)); free_page((unsigned long)clock_pair); clock_pair = NULL; } } int kvm_arch_ptp_get_clock(struct timespec64 *ts) { long ret; ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, KVM_CLOCK_PAIRING_WALLCLOCK); if (ret != 0) { pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); return -EOPNOTSUPP; } ts->tv_sec = clock_pair->sec; ts->tv_nsec = clock_pair->nsec; return 0; } int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec, enum clocksource_ids *cs_id) { struct pvclock_vcpu_time_info *src; unsigned int version; long ret; src = this_cpu_pvti(); do { /* * We are using a TSC value read in the hosts * kvm_hc_clock_pairing handling. * So any changes to tsc_to_system_mul * and tsc_shift or any other pvclock * data invalidate that measurement. */ version = pvclock_read_begin(src); ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, KVM_CLOCK_PAIRING_WALLCLOCK); if (ret != 0) { pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret); return -EOPNOTSUPP; } tspec->tv_sec = clock_pair->sec; tspec->tv_nsec = clock_pair->nsec; *cycle = __pvclock_read_cycles(src, clock_pair->tsc); } while (pvclock_read_retry(src, version)); *cs_id = CSID_X86_KVM_CLK; return 0; }
1459 18 1343 1343 263 277 398 72 71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 /* SPDX-License-Identifier: GPL-2.0-only */ /* * net busy poll support * Copyright(c) 2013 Intel Corporation. * * Author: Eliezer Tamir * * Contact Information: * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> */ #ifndef _LINUX_NET_BUSY_POLL_H #define _LINUX_NET_BUSY_POLL_H #include <linux/netdevice.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> #include <net/xdp.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) static inline bool napi_id_valid(unsigned int napi_id) { return napi_id >= MIN_NAPI_ID; } #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { return READ_ONCE(sysctl_net_busy_poll); } static inline bool sk_can_busy_loop(const struct sock *sk) { return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); } bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); void napi_suspend_irqs(unsigned int napi_id); void napi_resume_irqs(unsigned int napi_id); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { return 0; } static inline bool sk_can_busy_loop(struct sock *sk) { return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif } /* in poll/select we use the global sysctl_net_ll_poll value */ static inline bool busy_loop_timeout(unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline bool sk_busy_loop_timeout(struct sock *sk, unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline void sk_busy_loop(struct sock *sk, int nonblock) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id_valid(napi_id)) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } /* used in the NIC receive handler to mark the skb */ static inline void __skb_mark_napi_id(struct sk_buff *skb, const struct gro_node *gro) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ if (!napi_id_valid(skb->napi_id)) skb->napi_id = gro->cached_napi_id; #endif } static inline void skb_mark_napi_id(struct sk_buff *skb, const struct napi_struct *napi) { __skb_mark_napi_id(skb, &napi->gro); } /* used in the protocol handler to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id)) WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_update(sk, skb); } /* Variant of sk_mark_napi_id() for passive flow setup, * as sk->sk_napi_id and sk->sk_rx_queue_mapping content * needs to be set. */ static inline void sk_mark_napi_id_set(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_set(sk, skb); } static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) WRITE_ONCE(sk->sk_napi_id, napi_id); #endif } /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, skb->napi_id); #endif } #endif /* _LINUX_NET_BUSY_POLL_H */
1 1 1 40 3 5 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 // SPDX-License-Identifier: GPL-2.0-or-later /* * IP Payload Compression Protocol (IPComp) - RFC3173. * * Copyright (c) 2003 James Morris <jmorris@intercode.com.au> * Copyright (c) 2003-2025 Herbert Xu <herbert@gondor.apana.org.au> * * Todo: * - Tunable compression parameters. * - Compression stats. * - Adaptive compression. */ #include <crypto/acompress.h> #include <linux/err.h> #include <linux/module.h> #include <linux/skbuff_ref.h> #include <linux/slab.h> #include <net/ipcomp.h> #include <net/xfrm.h> #define IPCOMP_SCRATCH_SIZE 65400 struct ipcomp_skb_cb { struct xfrm_skb_cb xfrm; struct acomp_req *req; }; struct ipcomp_data { u16 threshold; struct crypto_acomp *tfm; }; struct ipcomp_req_extra { struct xfrm_state *x; struct scatterlist sg[]; }; static inline struct ipcomp_skb_cb *ipcomp_cb(struct sk_buff *skb) { struct ipcomp_skb_cb *cb = (void *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof(skb->cb)); return cb; } static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen) { struct acomp_req *req = ipcomp_cb(skb)->req; struct ipcomp_req_extra *extra; struct scatterlist *dsg; int len, dlen; if (unlikely(err)) goto out_free_req; extra = acomp_request_extra(req); dsg = extra->sg; dlen = req->dlen; pskb_trim_unique(skb, 0); __skb_put(skb, hlen); /* Only update truesize on input. */ if (!hlen) skb->truesize += dlen; skb->data_len = dlen; skb->len += dlen; do { skb_frag_t *frag; struct page *page; frag = skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags; page = sg_page(dsg); dsg = sg_next(dsg); len = PAGE_SIZE; if (dlen < len) len = dlen; skb_frag_fill_page_desc(frag, page, 0, len); skb_shinfo(skb)->nr_frags++; } while ((dlen -= len)); for (; dsg; dsg = sg_next(dsg)) __free_page(sg_page(dsg)); out_free_req: acomp_request_free(req); return err; } static int ipcomp_input_done2(struct sk_buff *skb, int err) { struct ip_comp_hdr *ipch = ip_comp_hdr(skb); const int plen = skb->len; skb->transport_header = skb->network_header + sizeof(*ipch); return ipcomp_post_acomp(skb, err, 0) ?: skb->len < (plen + sizeof(ip_comp_hdr)) ? -EINVAL : ipch->nexthdr; } static void ipcomp_input_done(void *data, int err) { struct sk_buff *skb = data; xfrm_input_resume(skb, ipcomp_input_done2(skb, err)); } static struct acomp_req *ipcomp_setup_req(struct xfrm_state *x, struct sk_buff *skb, int minhead, int dlen) { const int dnfrags = min(MAX_SKB_FRAGS, 16); struct ipcomp_data *ipcd = x->data; struct ipcomp_req_extra *extra; struct scatterlist *sg, *dsg; const int plen = skb->len; struct crypto_acomp *tfm; struct acomp_req *req; int nfrags; int total; int err; int i; ipcomp_cb(skb)->req = NULL; do { struct sk_buff *trailer; if (skb->len > PAGE_SIZE) { if (skb_linearize_cow(skb)) return ERR_PTR(-ENOMEM); nfrags = 1; break; } if (!skb_cloned(skb) && skb_headlen(skb) >= minhead) { if (!skb_is_nonlinear(skb)) { nfrags = 1; break; } else if (!skb_has_frag_list(skb)) { nfrags = skb_shinfo(skb)->nr_frags; nfrags++; break; } } nfrags = skb_cow_data(skb, skb_headlen(skb) < minhead ? minhead - skb_headlen(skb) : 0, &trailer); if (nfrags < 0) return ERR_PTR(nfrags); } while (0); tfm = ipcd->tfm; req = acomp_request_alloc_extra( tfm, sizeof(*extra) + sizeof(*sg) * (nfrags + dnfrags), GFP_ATOMIC); ipcomp_cb(skb)->req = req; if (!req) return ERR_PTR(-ENOMEM); extra = acomp_request_extra(req); extra->x = x; dsg = extra->sg; sg = dsg + dnfrags; sg_init_table(sg, nfrags); err = skb_to_sgvec(skb, sg, 0, plen); if (unlikely(err < 0)) return ERR_PTR(err); sg_init_table(dsg, dnfrags); total = 0; for (i = 0; i < dnfrags && total < dlen; i++) { struct page *page; page = alloc_page(GFP_ATOMIC); if (!page) break; sg_set_page(dsg + i, page, PAGE_SIZE, 0); total += PAGE_SIZE; } if (!i) return ERR_PTR(-ENOMEM); sg_mark_end(dsg + i - 1); dlen = min(dlen, total); acomp_request_set_params(req, sg, dsg, plen, dlen); return req; } static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { struct acomp_req *req; int err; req = ipcomp_setup_req(x, skb, 0, IPCOMP_SCRATCH_SIZE); err = PTR_ERR(req); if (IS_ERR(req)) goto out; acomp_request_set_callback(req, 0, ipcomp_input_done, skb); err = crypto_acomp_decompress(req); if (err == -EINPROGRESS) return err; out: return ipcomp_input_done2(skb, err); } int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { struct ip_comp_hdr *ipch __maybe_unused; if (!pskb_may_pull(skb, sizeof(*ipch))) return -EINVAL; skb->ip_summed = CHECKSUM_NONE; /* Remove ipcomp header and decompress original payload */ __skb_pull(skb, sizeof(*ipch)); return ipcomp_decompress(x, skb); } EXPORT_SYMBOL_GPL(ipcomp_input); static int ipcomp_output_push(struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); return 0; } static int ipcomp_output_done2(struct xfrm_state *x, struct sk_buff *skb, int err) { struct ip_comp_hdr *ipch; err = ipcomp_post_acomp(skb, err, sizeof(*ipch)); if (err) goto out_ok; /* Install ipcomp header, convert into ipcomp datagram. */ ipch = ip_comp_hdr(skb); ipch->nexthdr = *skb_mac_header(skb); ipch->flags = 0; ipch->cpi = htons((u16 )ntohl(x->id.spi)); *skb_mac_header(skb) = IPPROTO_COMP; out_ok: return ipcomp_output_push(skb); } static void ipcomp_output_done(void *data, int err) { struct ipcomp_req_extra *extra; struct sk_buff *skb = data; struct acomp_req *req; req = ipcomp_cb(skb)->req; extra = acomp_request_extra(req); xfrm_output_resume(skb_to_full_sk(skb), skb, ipcomp_output_done2(extra->x, skb, err)); } static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) { struct ip_comp_hdr *ipch __maybe_unused; struct acomp_req *req; int err; req = ipcomp_setup_req(x, skb, sizeof(*ipch), skb->len - sizeof(*ipch)); err = PTR_ERR(req); if (IS_ERR(req)) goto out; acomp_request_set_callback(req, 0, ipcomp_output_done, skb); err = crypto_acomp_compress(req); if (err == -EINPROGRESS) return err; out: return ipcomp_output_done2(x, skb, err); } int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipcomp_data *ipcd = x->data; if (skb->len < ipcd->threshold) { /* Don't bother compressing */ return ipcomp_output_push(skb); } return ipcomp_compress(x, skb); } EXPORT_SYMBOL_GPL(ipcomp_output); static void ipcomp_free_data(struct ipcomp_data *ipcd) { crypto_free_acomp(ipcd->tfm); } void ipcomp_destroy(struct xfrm_state *x) { struct ipcomp_data *ipcd = x->data; if (!ipcd) return; ipcomp_free_data(ipcd); kfree(ipcd); } EXPORT_SYMBOL_GPL(ipcomp_destroy); int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { int err; struct ipcomp_data *ipcd; struct xfrm_algo_desc *calg_desc; err = -EINVAL; if (!x->calg) { NL_SET_ERR_MSG(extack, "Missing required compression algorithm"); goto out; } if (x->encap) { NL_SET_ERR_MSG(extack, "IPComp is not compatible with encapsulation"); goto out; } err = -ENOMEM; ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); if (!ipcd) goto out; ipcd->tfm = crypto_alloc_acomp(x->calg->alg_name, 0, 0); if (IS_ERR(ipcd->tfm)) goto error; calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); BUG_ON(!calg_desc); ipcd->threshold = calg_desc->uinfo.comp.threshold; x->data = ipcd; err = 0; out: return err; error: ipcomp_free_data(ipcd); kfree(ipcd); goto out; } EXPORT_SYMBOL_GPL(ipcomp_init_state); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
29 7 29 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions and Declarations for tuple. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_tuple.h */ #ifndef _NF_CONNTRACK_TUPLE_H #define _NF_CONNTRACK_TUPLE_H #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/list_nulls.h> /* A `tuple' is a structure containing the information to uniquely identify a connection. ie. if two packets have the same tuple, they are in the same connection; if not, they are not. We divide the structure along "manipulatable" and "non-manipulatable" lines, for the benefit of the NAT code. */ #define NF_CT_TUPLE_L3SIZE ARRAY_SIZE(((union nf_inet_addr *)NULL)->all) /* The manipulable part of the tuple. */ struct nf_conntrack_man { union nf_inet_addr u3; union nf_conntrack_man_proto u; /* Layer 3 protocol */ u_int16_t l3num; }; /* This contains the information to distinguish a connection. */ struct nf_conntrack_tuple { struct nf_conntrack_man src; /* These are the parts of the tuple which are fixed. */ struct { union nf_inet_addr u3; union { /* Add other protocols here. */ __be16 all; struct { __be16 port; } tcp; struct { __be16 port; } udp; struct { u_int8_t type, code; } icmp; struct { __be16 port; } dccp; struct { __be16 port; } sctp; struct { __be16 key; } gre; } u; /* The protocol. */ u_int8_t protonum; /* The direction must be ignored for the tuplehash */ struct { } __nfct_hash_offsetend; /* The direction (for tuplehash) */ u_int8_t dir; } dst; }; struct nf_conntrack_tuple_mask { struct { union nf_inet_addr u3; union nf_conntrack_man_proto u; } src; }; static inline void nf_ct_dump_tuple_ip(const struct nf_conntrack_tuple *t) { #ifdef DEBUG printk("tuple %p: %u %pI4:%hu -> %pI4:%hu\n", t, t->dst.protonum, &t->src.u3.ip, ntohs(t->src.u.all), &t->dst.u3.ip, ntohs(t->dst.u.all)); #endif } static inline void nf_ct_dump_tuple_ipv6(const struct nf_conntrack_tuple *t) { #ifdef DEBUG printk("tuple %p: %u %pI6 %hu -> %pI6 %hu\n", t, t->dst.protonum, t->src.u3.all, ntohs(t->src.u.all), t->dst.u3.all, ntohs(t->dst.u.all)); #endif } static inline void nf_ct_dump_tuple(const struct nf_conntrack_tuple *t) { switch (t->src.l3num) { case AF_INET: nf_ct_dump_tuple_ip(t); break; case AF_INET6: nf_ct_dump_tuple_ipv6(t); break; } } /* If we're the first tuple, it's the original dir. */ #define NF_CT_DIRECTION(h) \ ((enum ip_conntrack_dir)(h)->tuple.dst.dir) /* Connections have two entries in the hash table: one for each way */ struct nf_conntrack_tuple_hash { struct hlist_nulls_node hnnode; struct nf_conntrack_tuple tuple; }; static inline bool __nf_ct_tuple_src_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) { return (nf_inet_addr_cmp(&t1->src.u3, &t2->src.u3) && t1->src.u.all == t2->src.u.all && t1->src.l3num == t2->src.l3num); } static inline bool __nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) { return (nf_inet_addr_cmp(&t1->dst.u3, &t2->dst.u3) && t1->dst.u.all == t2->dst.u.all && t1->dst.protonum == t2->dst.protonum); } static inline bool nf_ct_tuple_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) { return __nf_ct_tuple_src_equal(t1, t2) && __nf_ct_tuple_dst_equal(t1, t2); } static inline bool nf_ct_tuple_mask_equal(const struct nf_conntrack_tuple_mask *m1, const struct nf_conntrack_tuple_mask *m2) { return (nf_inet_addr_cmp(&m1->src.u3, &m2->src.u3) && m1->src.u.all == m2->src.u.all); } static inline bool nf_ct_tuple_src_mask_cmp(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2, const struct nf_conntrack_tuple_mask *mask) { int count; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) { if ((t1->src.u3.all[count] ^ t2->src.u3.all[count]) & mask->src.u3.all[count]) return false; } if ((t1->src.u.all ^ t2->src.u.all) & mask->src.u.all) return false; if (t1->src.l3num != t2->src.l3num || t1->dst.protonum != t2->dst.protonum) return false; return true; } static inline bool nf_ct_tuple_mask_cmp(const struct nf_conntrack_tuple *t, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { return nf_ct_tuple_src_mask_cmp(t, tuple, mask) && __nf_ct_tuple_dst_equal(t, tuple); } #endif /* _NF_CONNTRACK_TUPLE_H */
6611 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _ASM_X86_APIC_H #define _ASM_X86_APIC_H #include <linux/cpumask.h> #include <linux/static_call.h> #include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/apicdef.h> #include <linux/atomic.h> #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> #include <asm/hardirq.h> #include <asm/io.h> #include <asm/posted_intr.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 /* Macros for apic_extnmi which controls external NMI masking */ #define APIC_EXTNMI_BSP 0 /* Default */ #define APIC_EXTNMI_ALL 1 #define APIC_EXTNMI_NONE 2 /* * Debugging macros */ #define APIC_QUIET 0 #define APIC_VERBOSE 1 #define APIC_DEBUG 2 /* * Define the default level of output to be very little This can be turned * up by using apic=verbose for more information and apic=debug for _lots_ * of information. apic_verbosity is defined in apic.c */ #define apic_printk(v, s, a...) \ do { \ if ((v) <= apic_verbosity) \ printk(s, ##a); \ } while (0) #define apic_pr_verbose(s, a...) apic_printk(APIC_VERBOSE, KERN_INFO s, ##a) #define apic_pr_debug(s, a...) apic_printk(APIC_DEBUG, KERN_DEBUG s, ##a) #define apic_pr_debug_cont(s, a...) apic_printk(APIC_DEBUG, KERN_CONT s, ##a) /* Unconditional debug prints for code which is guarded by apic_verbosity already */ #define apic_dbg(s, a...) printk(KERN_DEBUG s, ##a) #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) extern void x86_32_probe_apic(void); #else static inline void x86_32_probe_apic(void) { } #endif extern u32 cpuid_to_apicid[]; #define CPU_ACPIID_INVALID U32_MAX #ifdef CONFIG_X86_LOCAL_APIC extern int apic_verbosity; extern int local_apic_timer_c2_ok; extern bool apic_is_disabled; extern unsigned int lapic_timer_period; extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { APIC_PIC, APIC_VIRTUAL_WIRE, APIC_VIRTUAL_WIRE_NO_CONFIG, APIC_SYMMETRIC_IO, APIC_SYMMETRIC_IO_NO_ROUTING }; /* * With 82489DX we can't rely on apic feature bit * retrieved via cpuid but still have to deal with * such an apic chip so we assume that SMP configuration * is found from MP table (64bit case uses ACPI mostly * which set smp presence flag as well so we are safe * to use this helper too). */ static inline bool apic_from_smp_config(void) { return smp_found_config && !apic_is_disabled; } /* * Basic functions accessing APICs. */ #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, ASM_OUTPUT("=r" (v), "=m" (*addr)), ASM_INPUT("0" (v), "m" (*addr))); } static inline u32 native_apic_mem_read(u32 reg) { return readl((void __iomem *)(APIC_BASE + reg)); } static inline void native_apic_mem_eoi(void) { native_apic_mem_write(APIC_EOI, APIC_EOI_ACK); } extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); static inline bool apic_is_x2apic_enabled(void) { u64 msr; if (rdmsrq_safe(MSR_IA32_APICBASE, &msr)) return false; return msr & X2APIC_ENABLE; } extern void enable_IR_x2apic(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void apic_soft_disable(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void apic_intr_mode_select(void); extern void apic_intr_mode_init(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern void lapic_update_tsc_freq(void); #ifdef CONFIG_X86_64 static inline bool apic_force_enable(unsigned long addr) { return false; } #else extern bool apic_force_enable(unsigned long addr); #endif extern void apic_ap_setup(void); /* * On 32bit this is mach-xxx local */ #ifdef CONFIG_X86_64 extern int apic_is_clustered_box(void); #else static inline int apic_is_clustered_box(void) { return 0; } #endif extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); extern void lapic_assign_system_vectors(void); extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); extern void lapic_update_legacy_vectors(void); extern void lapic_online(void); extern void lapic_offline(void); extern bool apic_needs_pit(void); extern void apic_send_IPI_allbutself(unsigned int vector); extern void topology_register_apic(u32 apic_id, u32 acpi_id, bool present); extern void topology_register_boot_apic(u32 apic_id); extern int topology_hotplug_apic(u32 apic_id, u32 acpi_id); extern void topology_hotunplug_apic(unsigned int cpu); extern void topology_apply_cmdline_limits_early(void); extern void topology_init_possible_cpus(void); extern void topology_reset_possible_cpus_up(void); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop static inline void lapic_update_tsc_freq(void) { } static inline void init_bsp_APIC(void) { } static inline void apic_intr_mode_select(void) { } static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } static inline bool apic_needs_pit(void) { return true; } static inline void topology_apply_cmdline_limits_early(void) { } static inline void topology_init_possible_cpus(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC static inline void native_apic_msr_write(u32 reg, u32 v) { if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || reg == APIC_LVR) return; wrmsrq(APIC_BASE_MSR + (reg >> 4), v); } static inline void native_apic_msr_eoi(void) { native_wrmsrq(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK); } static inline u32 native_apic_msr_read(u32 reg) { u64 msr; if (reg == APIC_DFR) return -1; rdmsrq(APIC_BASE_MSR + (reg >> 4), msr); return (u32)msr; } static inline void native_x2apic_icr_write(u32 low, u32 id) { wrmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); } static inline u64 native_x2apic_icr_read(void) { unsigned long val; rdmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), val); return val; } extern int x2apic_mode; extern int x2apic_phys; extern void __init x2apic_set_max_apicid(u32 apicid); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { return boot_cpu_has(X86_FEATURE_X2APIC) && apic_is_x2apic_enabled(); } #define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } static inline u32 native_apic_msr_read(u32 reg) { BUG(); } #define x2apic_mode (0) #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ extern void __init check_x2apic(void); struct irq_data; /* * Copyright 2004 James Cleverdon, IBM. * * Generic APIC sub-arch data struct. * * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ struct apic { /* Hotpath functions first */ void (*eoi)(void); void (*native_eoi)(void); void (*write)(u32 reg, u32 v); u32 (*read)(u32 reg); /* IPI related functions */ void (*wait_icr_idle)(void); u32 (*safe_wait_icr_idle)(void); void (*send_IPI)(int cpu, int vector); void (*send_IPI_mask)(const struct cpumask *mask, int vector); void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); u32 disable_esr : 1, dest_mode_logical : 1, x2apic_set_max_apicid : 1, nmi_to_offline_cpu : 1; u32 (*calc_dest_apicid)(unsigned int cpu); /* ICR related functions */ u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); /* The limit of the APIC ID space. */ u32 max_apic_id; /* Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); void (*init_apic_ldr)(void); u32 (*cpu_present_to_apicid)(int mps_cpu); u32 (*get_apic_id)(u32 id); /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu); /* wakeup secondary CPU using 64-bit wakeup point */ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); char *name; }; struct apic_override { void (*eoi)(void); void (*native_eoi)(void); void (*write)(u32 reg, u32 v); u32 (*read)(u32 reg); void (*send_IPI)(int cpu, int vector); void (*send_IPI_mask)(const struct cpumask *mask, int vector); void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu); int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); }; /* * Pointer to the local APIC driver in use on this system (there's * always just one such driver in use - the kernel decides via an * early probing process which one it picks - and then sticks to it): */ extern struct apic *apic; /* * APIC drivers are probed based on how they are listed in the .apicdrivers * section. So the order is important and enforced by the ordering * of different apic driver files in the Makefile. */ #define apic_driver(sym) \ static const struct apic *__apicdrivers_##sym __used \ __aligned(sizeof(struct apic *)) \ __section(".apicdrivers") = { &sym } extern struct apic *__apicdrivers[], *__apicdrivers_end[]; /* * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP extern int lapic_can_unplug_cpu(void); #endif #ifdef CONFIG_X86_LOCAL_APIC extern struct apic_override __x86_apic_override; void __init apic_setup_apic_calls(void); void __init apic_install_driver(struct apic *driver); #define apic_update_callback(_callback, _fn) { \ __x86_apic_override._callback = _fn; \ apic->_callback = _fn; \ static_call_update(apic_call_##_callback, _fn); \ pr_info("APIC: %s() replaced with %ps()\n", #_callback, _fn); \ } #define DECLARE_APIC_CALL(__cb) \ DECLARE_STATIC_CALL(apic_call_##__cb, *apic->__cb) DECLARE_APIC_CALL(eoi); DECLARE_APIC_CALL(native_eoi); DECLARE_APIC_CALL(icr_read); DECLARE_APIC_CALL(icr_write); DECLARE_APIC_CALL(read); DECLARE_APIC_CALL(send_IPI); DECLARE_APIC_CALL(send_IPI_mask); DECLARE_APIC_CALL(send_IPI_mask_allbutself); DECLARE_APIC_CALL(send_IPI_allbutself); DECLARE_APIC_CALL(send_IPI_all); DECLARE_APIC_CALL(send_IPI_self); DECLARE_APIC_CALL(wait_icr_idle); DECLARE_APIC_CALL(wakeup_secondary_cpu); DECLARE_APIC_CALL(wakeup_secondary_cpu_64); DECLARE_APIC_CALL(write); static __always_inline u32 apic_read(u32 reg) { return static_call(apic_call_read)(reg); } static __always_inline void apic_write(u32 reg, u32 val) { static_call(apic_call_write)(reg, val); } static __always_inline void apic_eoi(void) { static_call(apic_call_eoi)(); } static __always_inline void apic_native_eoi(void) { static_call(apic_call_native_eoi)(); } static __always_inline u64 apic_icr_read(void) { return static_call(apic_call_icr_read)(); } static __always_inline void apic_icr_write(u32 low, u32 high) { static_call(apic_call_icr_write)(low, high); } static __always_inline void __apic_send_IPI(int cpu, int vector) { static_call(apic_call_send_IPI)(cpu, vector); } static __always_inline void __apic_send_IPI_mask(const struct cpumask *mask, int vector) { static_call_mod(apic_call_send_IPI_mask)(mask, vector); } static __always_inline void __apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { static_call(apic_call_send_IPI_mask_allbutself)(mask, vector); } static __always_inline void __apic_send_IPI_allbutself(int vector) { static_call(apic_call_send_IPI_allbutself)(vector); } static __always_inline void __apic_send_IPI_all(int vector) { static_call(apic_call_send_IPI_all)(vector); } static __always_inline void __apic_send_IPI_self(int vector) { static_call_mod(apic_call_send_IPI_self)(vector); } static __always_inline void apic_wait_icr_idle(void) { static_call_cond(apic_call_wait_icr_idle)(); } static __always_inline u32 safe_apic_wait_icr_idle(void) { return apic->safe_wait_icr_idle ? apic->safe_wait_icr_idle() : 0; } static __always_inline bool apic_id_valid(u32 apic_id) { return apic_id <= apic->max_apic_id; } #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } static inline void apic_write(u32 reg, u32 val) { } static inline void apic_eoi(void) { } static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } static inline void apic_setup_apic_calls(void) { } #define apic_update_callback(_callback, _fn) do { } while (0) #endif /* CONFIG_X86_LOCAL_APIC */ extern void apic_ack_irq(struct irq_data *data); #define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32) #define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10) static inline bool lapic_vector_set_in_irr(unsigned int vector) { u32 irr = apic_read(APIC_IRR + APIC_VECTOR_TO_REG_OFFSET(vector)); return !!(irr & (1U << APIC_VECTOR_TO_BIT_NUMBER(vector))); } static inline bool is_vector_pending(unsigned int vector) { return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector); } #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 /* * Vector states are maintained by APIC in 32-bit registers that are * 16 bytes aligned. The status of each vector is kept in a single * bit. */ static inline int apic_find_highest_vector(void *bitmap) { int vec; u32 *reg; for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) { reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec); if (*reg) return __fls(*reg) + vec; } return -1; } static inline u32 apic_get_reg(void *regs, int reg) { return *((u32 *) (regs + reg)); } static inline void apic_set_reg(void *regs, int reg, u32 val) { *((u32 *) (regs + reg)) = val; } static __always_inline u64 apic_get_reg64(void *regs, int reg) { BUILD_BUG_ON(reg != APIC_ICR); return *((u64 *) (regs + reg)); } static __always_inline void apic_set_reg64(void *regs, int reg, u64 val) { BUILD_BUG_ON(reg != APIC_ICR); *((u64 *) (regs + reg)) = val; } static inline void apic_clear_vector(int vec, void *bitmap) { clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec)); } static inline void apic_set_vector(int vec, void *bitmap) { set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec)); } static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec)); } /* * Warm reset vector position: */ #define TRAMPOLINE_PHYS_LOW 0x467 #define TRAMPOLINE_PHYS_HIGH 0x469 #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> extern struct apic apic_noop; static inline u32 read_apic_id(void) { u32 reg = apic_read(APIC_ID); return apic->get_apic_id(reg); } #ifdef CONFIG_X86_64 typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip); extern int default_acpi_madt_oem_check(char *, char *); extern void x86_64_probe_apic(void); #else static inline int default_acpi_madt_oem_check(char *a, char *b) { return 0; } static inline void x86_64_probe_apic(void) { } #endif extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); extern u32 default_cpu_present_to_apicid(int mps_cpu); void apic_send_nmi_to_offline_cpu(unsigned int cpu); #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 read_apic_id(void) { return 0; } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_SMP void apic_smt_update(void); #else static inline void apic_smt_update(void) { } #endif struct msi_msg; struct irq_cfg; extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar); extern void ioapic_zap_locks(void); #endif /* _ASM_X86_APIC_H */
313 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SHA-256 optimized for x86_64 * * Copyright 2025 Google LLC */ #include <asm/fpu/api.h> #include <crypto/internal/simd.h> #include <linux/static_call.h> DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); #define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ asmlinkage void asm_fn(struct sha256_block_state *state, \ const u8 *data, size_t nblocks); \ static void c_fn(struct sha256_block_state *state, const u8 *data, \ size_t nblocks) \ { \ if (likely(crypto_simd_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ } else { \ sha256_blocks_generic(state, data, nblocks); \ } \ } DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3); DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx); DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx); DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform); static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { static_call(sha256_blocks_x86)(state, data, nblocks); } #define sha256_mod_init_arch sha256_mod_init_arch static inline void sha256_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha256_blocks_x86, sha256_blocks_ni); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha256_blocks_x86, sha256_blocks_avx2); else static_call_update(sha256_blocks_x86, sha256_blocks_avx); } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { static_call_update(sha256_blocks_x86, sha256_blocks_ssse3); } }
596 265 1 1 1 1 37 63 2 4 3 1107 567 1000 2 17 1147 30 1 105 1227 940 15 15 813 940 939 939 940 880 642 939 814 137 812 811 591 5 677 682 682 682 682 5 677 17 35 1 2 940 940 2 940 940 940 940 1 940 44 815 677 319 813 815 315 682 682 628 628 628 608 608 782 62 35 142 7 338 87 190 2 508 95 1230 1229 1231 40 1138 51 333 880 879 160 596 591 591 590 591 539 2 595 595 62 4 551 68 68 5 44 5 19 44 44 11 1 32 556 427 143 5 5 5 5 10 10 10 231 27 98 120 323 10 306 7 34 34 34 34 27 7 33 34 18 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 // SPDX-License-Identifier: GPL-2.0-only /* * 9P Client * * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/idr.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/uio.h> #include <linux/netfs.h> #include <net/9p/9p.h> #include <linux/parser.h> #include <linux/seq_file.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include "protocol.h" #define CREATE_TRACE_POINTS #include <trace/events/9p.h> /* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ + * room for write (16 extra) or read (11 extra) operands. */ #define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ) /* Client Option Parsing (code inspired by NFS code) * - a little lazy - parse all client options */ enum { Opt_msize, Opt_trans, Opt_legacy, Opt_version, Opt_err, }; static const match_table_t tokens = { {Opt_msize, "msize=%u"}, {Opt_legacy, "noextend"}, {Opt_trans, "trans=%s"}, {Opt_version, "version=%s"}, {Opt_err, NULL}, }; inline int p9_is_proto_dotl(struct p9_client *clnt) { return clnt->proto_version == p9_proto_2000L; } EXPORT_SYMBOL(p9_is_proto_dotl); inline int p9_is_proto_dotu(struct p9_client *clnt) { return clnt->proto_version == p9_proto_2000u; } EXPORT_SYMBOL(p9_is_proto_dotu); int p9_show_client_options(struct seq_file *m, struct p9_client *clnt) { if (clnt->msize != DEFAULT_MSIZE) seq_printf(m, ",msize=%u", clnt->msize); seq_printf(m, ",trans=%s", clnt->trans_mod->name); switch (clnt->proto_version) { case p9_proto_legacy: seq_puts(m, ",noextend"); break; case p9_proto_2000u: seq_puts(m, ",version=9p2000.u"); break; case p9_proto_2000L: /* Default */ break; } if (clnt->trans_mod->show_options) return clnt->trans_mod->show_options(m, clnt); return 0; } EXPORT_SYMBOL(p9_show_client_options); /* Some error codes are taken directly from the server replies, * make sure they are valid. */ static int safe_errno(int err) { if (err > 0 || err < -MAX_ERRNO) { p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err); return -EPROTO; } return err; } /* Interpret mount option for protocol version */ static int get_protocol_version(char *s) { int version = -EINVAL; if (!strcmp(s, "9p2000")) { version = p9_proto_legacy; p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n"); } else if (!strcmp(s, "9p2000.u")) { version = p9_proto_2000u; p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n"); } else if (!strcmp(s, "9p2000.L")) { version = p9_proto_2000L; p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n"); } else { pr_info("Unknown protocol version %s\n", s); } return version; } /** * parse_opts - parse mount options into client structure * @opts: options string passed from mount * @clnt: existing v9fs client information * * Return 0 upon success, -ERRNO upon failure */ static int parse_opts(char *opts, struct p9_client *clnt) { char *options, *tmp_options; char *p; substring_t args[MAX_OPT_ARGS]; int option; char *s; int ret = 0; clnt->proto_version = p9_proto_2000L; clnt->msize = DEFAULT_MSIZE; if (!opts) return 0; tmp_options = kstrdup(opts, GFP_KERNEL); if (!tmp_options) return -ENOMEM; options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { int token, r; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_msize: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; continue; } if (option < 4096) { p9_debug(P9_DEBUG_ERROR, "msize should be at least 4k\n"); ret = -EINVAL; continue; } clnt->msize = option; break; case Opt_trans: s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; p9_debug(P9_DEBUG_ERROR, "problem allocating copy of trans arg\n"); goto free_and_return; } v9fs_put_trans(clnt->trans_mod); clnt->trans_mod = v9fs_get_trans_by_name(s); if (!clnt->trans_mod) { pr_info("Could not find request transport: %s\n", s); ret = -EINVAL; } kfree(s); break; case Opt_legacy: clnt->proto_version = p9_proto_legacy; break; case Opt_version: s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; p9_debug(P9_DEBUG_ERROR, "problem allocating copy of version arg\n"); goto free_and_return; } r = get_protocol_version(s); if (r < 0) ret = r; else clnt->proto_version = r; kfree(s); break; default: continue; } } free_and_return: if (ret) v9fs_put_trans(clnt->trans_mod); kfree(tmp_options); return ret; } static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, int alloc_msize) { if (likely(c->fcall_cache) && alloc_msize == c->msize) { fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); fc->cache = c->fcall_cache; } else { fc->sdata = kmalloc(alloc_msize, GFP_NOFS); fc->cache = NULL; } if (!fc->sdata) return -ENOMEM; fc->capacity = alloc_msize; fc->id = 0; fc->tag = P9_NOTAG; return 0; } void p9_fcall_fini(struct p9_fcall *fc) { /* sdata can be NULL for interrupted requests in trans_rdma, * and kmem_cache_free does not do NULL-check for us */ if (unlikely(!fc->sdata)) return; if (fc->cache) kmem_cache_free(fc->cache, fc->sdata); else kfree(fc->sdata); } EXPORT_SYMBOL(p9_fcall_fini); static struct kmem_cache *p9_req_cache; /** * p9_tag_alloc - Allocate a new request. * @c: Client session. * @type: Transaction type. * @t_size: Buffer size for holding this request * (automatic calculation by format template if 0). * @r_size: Buffer size for holding server's reply on this request * (automatic calculation by format template if 0). * @fmt: Format template for assembling 9p request message * (see p9pdu_vwritef). * @ap: Variable arguments to be fed to passed format template * (see p9pdu_vwritef). * * Context: Process context. * Return: Pointer to new request. */ static struct p9_req_t * p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size, const char *fmt, va_list ap) { struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS); int alloc_tsize; int alloc_rsize; int tag; va_list apc; va_copy(apc, ap); alloc_tsize = min_t(size_t, c->msize, t_size ?: p9_msg_buf_size(c, type, fmt, apc)); va_end(apc); alloc_rsize = min_t(size_t, c->msize, r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap)); if (!req) return ERR_PTR(-ENOMEM); if (p9_fcall_init(c, &req->tc, alloc_tsize)) goto free_req; if (p9_fcall_init(c, &req->rc, alloc_rsize)) goto free; p9pdu_reset(&req->tc); p9pdu_reset(&req->rc); req->t_err = 0; req->status = REQ_STATUS_ALLOC; /* refcount needs to be set to 0 before inserting into the idr * so p9_tag_lookup does not accept a request that is not fully * initialized. refcount_set to 2 below will mark request ready. */ refcount_set(&req->refcount, 0); init_waitqueue_head(&req->wq); INIT_LIST_HEAD(&req->req_list); idr_preload(GFP_NOFS); spin_lock_irq(&c->lock); if (type == P9_TVERSION) tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1, GFP_NOWAIT); else tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); req->tc.tag = tag; spin_unlock_irq(&c->lock); idr_preload_end(); if (tag < 0) goto free; /* Init ref to two because in the general case there is one ref * that is put asynchronously by a writer thread, one ref * temporarily given by p9_tag_lookup and put by p9_client_cb * in the recv thread, and one ref put by p9_req_put in the * main thread. The only exception is virtio that does not use * p9_tag_lookup but does not have a writer thread either * (the write happens synchronously in the request/zc_request * callback), so p9_client_cb eats the second ref there * as the pointer is duplicated directly by virtqueue_add_sgs() */ refcount_set(&req->refcount, 2); return req; free: p9_fcall_fini(&req->tc); p9_fcall_fini(&req->rc); free_req: kmem_cache_free(p9_req_cache, req); return ERR_PTR(-ENOMEM); } /** * p9_tag_lookup - Look up a request by tag. * @c: Client session. * @tag: Transaction ID. * * Context: Any context. * Return: A request, or %NULL if there is no request with that tag. */ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) { struct p9_req_t *req; rcu_read_lock(); again: req = idr_find(&c->reqs, tag); if (req) { /* We have to be careful with the req found under rcu_read_lock * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the * ref again without corrupting other data, then check again * that the tag matches once we have the ref */ if (!p9_req_try_get(req)) goto again; if (req->tc.tag != tag) { p9_req_put(c, req); goto again; } } rcu_read_unlock(); return req; } EXPORT_SYMBOL(p9_tag_lookup); /** * p9_tag_remove - Remove a tag. * @c: Client session. * @r: Request of reference. * * Context: Any context. */ static void p9_tag_remove(struct p9_client *c, struct p9_req_t *r) { unsigned long flags; u16 tag = r->tc.tag; p9_debug(P9_DEBUG_MUX, "freeing clnt %p req %p tag: %d\n", c, r, tag); spin_lock_irqsave(&c->lock, flags); idr_remove(&c->reqs, tag); spin_unlock_irqrestore(&c->lock, flags); } int p9_req_put(struct p9_client *c, struct p9_req_t *r) { if (refcount_dec_and_test(&r->refcount)) { p9_tag_remove(c, r); p9_fcall_fini(&r->tc); p9_fcall_fini(&r->rc); kmem_cache_free(p9_req_cache, r); return 1; } return 0; } EXPORT_SYMBOL(p9_req_put); /** * p9_tag_cleanup - cleans up tags structure and reclaims resources * @c: v9fs client struct * * This frees resources associated with the tags structure * */ static void p9_tag_cleanup(struct p9_client *c) { struct p9_req_t *req; int id; rcu_read_lock(); idr_for_each_entry(&c->reqs, req, id) { pr_info("Tag %d still in use\n", id); if (p9_req_put(c, req) == 0) pr_warn("Packet with tag %d has still references", req->tc.tag); } rcu_read_unlock(); } /** * p9_client_cb - call back from transport to client * @c: client state * @req: request received * @status: request status, one of REQ_STATUS_* * */ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) { p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag); /* This barrier is needed to make sure any change made to req before * the status change is visible to another thread */ smp_wmb(); WRITE_ONCE(req->status, status); wake_up(&req->wq); p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); p9_req_put(c, req); } EXPORT_SYMBOL(p9_client_cb); /** * p9_parse_header - parse header arguments out of a packet * @pdu: packet to parse * @size: size of packet * @type: type of request * @tag: tag of packet * @rewind: set if we need to rewind offset afterwards */ int p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag, int rewind) { s8 r_type; s16 r_tag; s32 r_size; int offset = pdu->offset; int err; pdu->offset = 0; err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag); if (err) goto rewind_and_exit; if (type) *type = r_type; if (tag) *tag = r_tag; if (size) *size = r_size; if (pdu->size != r_size || r_size < 7) { err = -EINVAL; goto rewind_and_exit; } pdu->id = r_type; pdu->tag = r_tag; p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n", pdu->size, pdu->id, pdu->tag); rewind_and_exit: if (rewind) pdu->offset = offset; return err; } EXPORT_SYMBOL(p9_parse_header); /** * p9_check_errors - check 9p packet for error return and process it * @c: current client instance * @req: request to parse and check for error conditions * * returns error code if one is discovered, otherwise returns 0 * * this will have to be more complicated if we have multiple * error packet types */ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) { s8 type; int err; int ecode; err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); if (req->rc.size > req->rc.capacity && !req->rc.zc) { pr_err("requested packet size too big: %d does not fit %zu (type=%d)\n", req->rc.size, req->rc.capacity, req->rc.id); return -EIO; } /* dump the response from server * This should be after check errors which poplulate pdu_fcall. */ trace_9p_protocol_dump(c, &req->rc); if (err) { p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; } if (type != P9_RERROR && type != P9_RLERROR) return 0; if (!p9_is_proto_dotl(c)) { char *ename = NULL; err = p9pdu_readf(&req->rc, c->proto_version, "s?d", &ename, &ecode); if (err) { kfree(ename); goto out_err; } if (p9_is_proto_dotu(c) && ecode < 512) err = -ecode; if (!err) { err = p9_errstr2errno(ename, strlen(ename)); p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, ename); } kfree(ename); } else { err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); if (err) goto out_err; err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); } return err; out_err: p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); return err; } static struct p9_req_t * p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); /** * p9_client_flush - flush (cancel) a request * @c: client state * @oldreq: request to cancel * * This sents a flush for a particular request and links * the flush request to the original request. The current * code only supports a single flush request although the protocol * allows for multiple flush requests to be sent for a single request. * */ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) { struct p9_req_t *req; s16 oldtag; int err; err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1); if (err) return err; p9_debug(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag); req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag); if (IS_ERR(req)) return PTR_ERR(req); /* if we haven't received a response for oldreq, * remove it from the list */ if (READ_ONCE(oldreq->status) == REQ_STATUS_SENT) { if (c->trans_mod->cancelled) c->trans_mod->cancelled(c, oldreq); } p9_req_put(c, req); return 0; } static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, int8_t type, uint t_size, uint r_size, const char *fmt, va_list ap) { int err; struct p9_req_t *req; va_list apc; p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type); /* we allow for any status other than disconnected */ if (c->status == Disconnected) return ERR_PTR(-EIO); /* if status is begin_disconnected we allow only clunk request */ if (c->status == BeginDisconnect && type != P9_TCLUNK) return ERR_PTR(-EIO); va_copy(apc, ap); req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc); va_end(apc); if (IS_ERR(req)) return req; /* marshall the data */ p9pdu_prepare(&req->tc, req->tc.tag, type); err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap); if (err) goto reterr; p9pdu_finalize(c, &req->tc); trace_9p_client_req(c, type, req->tc.tag); return req; reterr: p9_req_put(c, req); /* We have to put also the 2nd reference as it won't be used */ p9_req_put(c, req); return ERR_PTR(err); } /** * p9_client_rpc - issue a request and wait for a response * @c: client session * @type: type of request * @fmt: protocol format string (see protocol.c) * * Returns request structure (which client must free using p9_req_put) */ static struct p9_req_t * p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) { va_list ap; int sigpending, err; unsigned long flags; struct p9_req_t *req; /* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to * auto determine an appropriate (small) request/response size * according to actual message data being sent. Currently RDMA * transport is excluded from this response message size optimization, * as it would not cope with it, due to its pooled response buffers * (using an optimized request size for RDMA as well though). */ const uint tsize = 0; const uint rsize = c->trans_mod->pooled_rbuffers ? c->msize : 0; va_start(ap, fmt); req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap); va_end(ap); if (IS_ERR(req)) return req; req->tc.zc = false; req->rc.zc = false; if (signal_pending(current)) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); } else { sigpending = 0; } err = c->trans_mod->request(c, req); if (err < 0) { /* write won't happen */ p9_req_put(c, req); if (err != -ERESTARTSYS && err != -EFAULT) c->status = Disconnected; goto recalc_sigpending; } again: /* Wait for the response */ err = wait_event_killable(req->wq, READ_ONCE(req->status) >= REQ_STATUS_RCVD); /* Make sure our req is coherent with regard to updates in other * threads - echoes to wmb() in the callback */ smp_rmb(); if (err == -ERESTARTSYS && c->status == Connected && type == P9_TFLUSH) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); goto again; } if (READ_ONCE(req->status) == REQ_STATUS_ERROR) { p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } if (err == -ERESTARTSYS && c->status == Connected) { p9_debug(P9_DEBUG_MUX, "flushing\n"); sigpending = 1; clear_thread_flag(TIF_SIGPENDING); if (c->trans_mod->cancel(c, req)) p9_client_flush(c, req); /* if we received the response anyway, don't signal error */ if (READ_ONCE(req->status) == REQ_STATUS_RCVD) err = 0; } recalc_sigpending: if (sigpending) { spin_lock_irqsave(&current->sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(&current->sighand->siglock, flags); } if (err < 0) goto reterr; err = p9_check_errors(c, req); trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: p9_req_put(c, req); return ERR_PTR(safe_errno(err)); } /** * p9_client_zc_rpc - issue a request and wait for a response * @c: client session * @type: type of request * @uidata: destination for zero copy read * @uodata: source for zero copy write * @inlen: read buffer size * @olen: write buffer size * @in_hdrlen: reader header size, This is the size of response protocol data * @fmt: protocol format string (see protocol.c) * * Returns request structure (which client must free using p9_req_put) */ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, struct iov_iter *uidata, struct iov_iter *uodata, int inlen, int olen, int in_hdrlen, const char *fmt, ...) { va_list ap; int sigpending, err; unsigned long flags; struct p9_req_t *req; va_start(ap, fmt); /* We allocate a inline protocol data of only 4k bytes. * The actual content is passed in zero-copy fashion. */ req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, P9_ZC_HDR_SZ, fmt, ap); va_end(ap); if (IS_ERR(req)) return req; req->tc.zc = true; req->rc.zc = true; if (signal_pending(current)) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); } else { sigpending = 0; } err = c->trans_mod->zc_request(c, req, uidata, uodata, inlen, olen, in_hdrlen); if (err < 0) { if (err == -EIO) c->status = Disconnected; if (err != -ERESTARTSYS) goto recalc_sigpending; } if (READ_ONCE(req->status) == REQ_STATUS_ERROR) { p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } if (err == -ERESTARTSYS && c->status == Connected) { p9_debug(P9_DEBUG_MUX, "flushing\n"); sigpending = 1; clear_thread_flag(TIF_SIGPENDING); if (c->trans_mod->cancel(c, req)) p9_client_flush(c, req); /* if we received the response anyway, don't signal error */ if (READ_ONCE(req->status) == REQ_STATUS_RCVD) err = 0; } recalc_sigpending: if (sigpending) { spin_lock_irqsave(&current->sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(&current->sighand->siglock, flags); } if (err < 0) goto reterr; err = p9_check_errors(c, req); trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: p9_req_put(c, req); return ERR_PTR(safe_errno(err)); } static struct p9_fid *p9_fid_create(struct p9_client *clnt) { int ret; struct p9_fid *fid; p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); fid = kzalloc(sizeof(*fid), GFP_KERNEL); if (!fid) return NULL; fid->mode = -1; fid->uid = current_fsuid(); fid->clnt = clnt; refcount_set(&fid->count, 1); idr_preload(GFP_KERNEL); spin_lock_irq(&clnt->lock); ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1, GFP_NOWAIT); spin_unlock_irq(&clnt->lock); idr_preload_end(); if (!ret) { trace_9p_fid_ref(fid, P9_FID_REF_CREATE); return fid; } kfree(fid); return NULL; } static void p9_fid_destroy(struct p9_fid *fid) { struct p9_client *clnt; unsigned long flags; p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid); trace_9p_fid_ref(fid, P9_FID_REF_DESTROY); clnt = fid->clnt; spin_lock_irqsave(&clnt->lock, flags); idr_remove(&clnt->fids, fid->fid); spin_unlock_irqrestore(&clnt->lock, flags); kfree(fid->rdir); kfree(fid); } /* We also need to export tracepoint symbols for tracepoint_enabled() */ EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref); void do_trace_9p_fid_get(struct p9_fid *fid) { trace_9p_fid_ref(fid, P9_FID_REF_GET); } EXPORT_SYMBOL(do_trace_9p_fid_get); void do_trace_9p_fid_put(struct p9_fid *fid) { trace_9p_fid_ref(fid, P9_FID_REF_PUT); } EXPORT_SYMBOL(do_trace_9p_fid_put); static int p9_client_version(struct p9_client *c) { int err; struct p9_req_t *req; char *version = NULL; int msize; p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n", c->msize, c->proto_version); switch (c->proto_version) { case p9_proto_2000L: req = p9_client_rpc(c, P9_TVERSION, "ds", c->msize, "9P2000.L"); break; case p9_proto_2000u: req = p9_client_rpc(c, P9_TVERSION, "ds", c->msize, "9P2000.u"); break; case p9_proto_legacy: req = p9_client_rpc(c, P9_TVERSION, "ds", c->msize, "9P2000"); break; default: return -EINVAL; } if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, c->proto_version, "ds", &msize, &version); if (err) { p9_debug(P9_DEBUG_9P, "version error %d\n", err); trace_9p_protocol_dump(c, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version); if (!strncmp(version, "9P2000.L", 8)) { c->proto_version = p9_proto_2000L; } else if (!strncmp(version, "9P2000.u", 8)) { c->proto_version = p9_proto_2000u; } else if (!strncmp(version, "9P2000", 6)) { c->proto_version = p9_proto_legacy; } else { p9_debug(P9_DEBUG_ERROR, "server returned an unknown version: %s\n", version); err = -EREMOTEIO; goto error; } if (msize < 4096) { p9_debug(P9_DEBUG_ERROR, "server returned a msize < 4096: %d\n", msize); err = -EREMOTEIO; goto error; } if (msize < c->msize) c->msize = msize; error: kfree(version); p9_req_put(c, req); return err; } struct p9_client *p9_client_create(const char *dev_name, char *options) { int err; static atomic_t seqno = ATOMIC_INIT(0); struct p9_client *clnt; char *client_id; char *cache_name; clnt = kmalloc(sizeof(*clnt), GFP_KERNEL); if (!clnt) return ERR_PTR(-ENOMEM); clnt->trans_mod = NULL; clnt->trans = NULL; clnt->fcall_cache = NULL; client_id = utsname()->nodename; memcpy(clnt->name, client_id, strlen(client_id) + 1); spin_lock_init(&clnt->lock); idr_init(&clnt->fids); idr_init(&clnt->reqs); err = parse_opts(options, clnt); if (err < 0) goto free_client; if (!clnt->trans_mod) clnt->trans_mod = v9fs_get_default_trans(); if (!clnt->trans_mod) { err = -EPROTONOSUPPORT; p9_debug(P9_DEBUG_ERROR, "No transport defined or default transport\n"); goto free_client; } p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); err = clnt->trans_mod->create(clnt, dev_name, options); if (err) goto put_trans; if (clnt->msize > clnt->trans_mod->maxsize) { clnt->msize = clnt->trans_mod->maxsize; pr_info("Limiting 'msize' to %d as this is the maximum " "supported by transport %s\n", clnt->msize, clnt->trans_mod->name ); } if (clnt->msize < 4096) { p9_debug(P9_DEBUG_ERROR, "Please specify a msize of at least 4k\n"); err = -EINVAL; goto close_trans; } err = p9_client_version(clnt); if (err) goto close_trans; cache_name = kasprintf(GFP_KERNEL, "9p-fcall-cache-%u", atomic_inc_return(&seqno)); if (!cache_name) { err = -ENOMEM; goto close_trans; } /* P9_HDRSZ + 4 is the smallest packet header we can have that is * followed by data accessed from userspace by read */ clnt->fcall_cache = kmem_cache_create_usercopy(cache_name, clnt->msize, 0, 0, P9_HDRSZ + 4, clnt->msize - (P9_HDRSZ + 4), NULL); kfree(cache_name); return clnt; close_trans: clnt->trans_mod->close(clnt); put_trans: v9fs_put_trans(clnt->trans_mod); free_client: kfree(clnt); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_create); void p9_client_destroy(struct p9_client *clnt) { struct p9_fid *fid; int id; p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt); if (clnt->trans_mod) clnt->trans_mod->close(clnt); v9fs_put_trans(clnt->trans_mod); idr_for_each_entry(&clnt->fids, fid, id) { pr_info("Found fid %d not clunked\n", fid->fid); p9_fid_destroy(fid); } p9_tag_cleanup(clnt); kmem_cache_destroy(clnt->fcall_cache); kfree(clnt); } EXPORT_SYMBOL(p9_client_destroy); void p9_client_disconnect(struct p9_client *clnt) { p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt); clnt->status = Disconnected; } EXPORT_SYMBOL(p9_client_disconnect); void p9_client_begin_disconnect(struct p9_client *clnt) { p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt); clnt->status = BeginDisconnect; } EXPORT_SYMBOL(p9_client_begin_disconnect); struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, const char *uname, kuid_t n_uname, const char *aname) { int err; struct p9_req_t *req; struct p9_fid *fid; struct p9_qid qid; p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", afid ? afid->fid : -1, uname, aname); fid = p9_fid_create(clnt); if (!fid) { err = -ENOMEM; goto error; } fid->uid = n_uname; req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid, afid ? afid->fid : P9_NOFID, uname, aname, n_uname); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", &qid); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n", qid.type, qid.path, qid.version); memmove(&fid->qid, &qid, sizeof(struct p9_qid)); p9_req_put(clnt, req); return fid; error: if (fid) p9_fid_destroy(fid); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_attach); struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, const unsigned char * const *wnames, int clone) { int err; struct p9_client *clnt; struct p9_fid *fid; struct p9_qid *wqids; struct p9_req_t *req; u16 nwqids, count; wqids = NULL; clnt = oldfid->clnt; if (clone) { fid = p9_fid_create(clnt); if (!fid) { err = -ENOMEM; goto error; } fid->uid = oldfid->uid; } else { fid = oldfid; } p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n", oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL); req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid, nwname, wnames); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "R", &nwqids, &wqids); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); goto clunk_fid; } p9_req_put(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids); if (nwqids != nwname) { err = -ENOENT; goto clunk_fid; } for (count = 0; count < nwqids; count++) p9_debug(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n", count, wqids[count].type, wqids[count].path, wqids[count].version); if (nwname) memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid)); else memmove(&fid->qid, &oldfid->qid, sizeof(struct p9_qid)); kfree(wqids); return fid; clunk_fid: kfree(wqids); p9_fid_put(fid); fid = NULL; error: if (fid && fid != oldfid) p9_fid_destroy(fid); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_walk); int p9_client_open(struct p9_fid *fid, int mode) { int err; struct p9_client *clnt; struct p9_req_t *req; struct p9_qid qid; int iounit; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> %s fid %d mode %d\n", p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode); if (fid->mode != -1) return -EINVAL; if (p9_is_proto_dotl(clnt)) req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode & P9L_MODE_MASK); else req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode & P9L_MODE_MASK); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } p9_debug(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n", p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type, qid.path, qid.version, iounit); memmove(&fid->qid, &qid, sizeof(struct p9_qid)); fid->mode = mode; fid->iounit = iounit; free_and_error: p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_open); int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 mode, kgid_t gid, struct p9_qid *qid) { int err; struct p9_client *clnt; struct p9_req_t *req; int iounit; p9_debug(P9_DEBUG_9P, ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n", ofid->fid, name, flags, mode, from_kgid(&init_user_ns, gid)); clnt = ofid->clnt; if (ofid->mode != -1) return -EINVAL; req = p9_client_rpc(clnt, P9_TLCREATE, "dsddg", ofid->fid, name, flags, mode & P9L_MODE_MASK, gid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", qid, &iounit); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } p9_debug(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n", qid->type, qid->path, qid->version, iounit); memmove(&ofid->qid, qid, sizeof(struct p9_qid)); ofid->mode = flags; ofid->iounit = iounit; free_and_error: p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_create_dotl); int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, char *extension) { int err; struct p9_client *clnt; struct p9_req_t *req; struct p9_qid qid; int iounit; p9_debug(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n", fid->fid, name, perm, mode); clnt = fid->clnt; if (fid->mode != -1) return -EINVAL; req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm, mode & P9L_MODE_MASK, extension); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } p9_debug(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n", qid.type, qid.path, qid.version, iounit); memmove(&fid->qid, &qid, sizeof(struct p9_qid)); fid->mode = mode; fid->iounit = iounit; free_and_error: p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_fcreate); int p9_client_symlink(struct p9_fid *dfid, const char *name, const char *symtgt, kgid_t gid, struct p9_qid *qid) { int err; struct p9_client *clnt; struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s symtgt %s\n", dfid->fid, name, symtgt); clnt = dfid->clnt; req = p9_client_rpc(clnt, P9_TSYMLINK, "dssg", dfid->fid, name, symtgt, gid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } p9_debug(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n", qid->type, qid->path, qid->version); free_and_error: p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_symlink); int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newname) { struct p9_client *clnt; struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n", dfid->fid, oldfid->fid, newname); clnt = dfid->clnt; req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid, newname); if (IS_ERR(req)) return PTR_ERR(req); p9_debug(P9_DEBUG_9P, "<<< RLINK\n"); p9_req_put(clnt, req); return 0; } EXPORT_SYMBOL(p9_client_link); int p9_client_fsync(struct p9_fid *fid, int datasync) { int err = 0; struct p9_client *clnt; struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n", fid->fid, datasync); clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_fsync); int p9_client_clunk(struct p9_fid *fid) { int err = 0; struct p9_client *clnt; struct p9_req_t *req; int retries = 0; again: p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid, retries); clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid); p9_req_put(clnt, req); error: /* Fid is not valid even after a failed clunk * If interrupted, retry once then give up and * leak fid until umount. */ if (err == -ERESTARTSYS) { if (retries++ == 0) goto again; } else { p9_fid_destroy(fid); } return err; } EXPORT_SYMBOL(p9_client_clunk); int p9_client_remove(struct p9_fid *fid) { int err = 0; struct p9_client *clnt; struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid); clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TREMOVE, "d", fid->fid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid); p9_req_put(clnt, req); error: if (err == -ERESTARTSYS) p9_fid_put(fid); else p9_fid_destroy(fid); return err; } EXPORT_SYMBOL(p9_client_remove); int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) { int err = 0; struct p9_req_t *req; struct p9_client *clnt; p9_debug(P9_DEBUG_9P, ">>> TUNLINKAT fid %d %s %d\n", dfid->fid, name, flags); clnt = dfid->clnt; req = p9_client_rpc(clnt, P9_TUNLINKAT, "dsd", dfid->fid, name, flags); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name); p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_unlinkat); int p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) { int total = 0; *err = 0; while (iov_iter_count(to)) { int count; count = p9_client_read_once(fid, offset, to, err); if (!count || *err) break; offset += count; total += count; } return total; } EXPORT_SYMBOL(p9_client_read); int p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) { struct p9_client *clnt = fid->clnt; struct p9_req_t *req; int count = iov_iter_count(to); u32 rsize, received; bool non_zc = false; char *dataptr; *err = 0; p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %zu\n", fid->fid, offset, iov_iter_count(to)); rsize = fid->iounit; if (!rsize || rsize > clnt->msize - P9_IOHDRSZ) rsize = clnt->msize - P9_IOHDRSZ; if (count < rsize) rsize = count; /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && rsize > 1024) { /* response header len is 11 * PDU Header(7) + IO Size (4) */ req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize, 0, 11, "dqd", fid->fid, offset, rsize); } else { non_zc = true; req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); } if (IS_ERR(req)) { *err = PTR_ERR(req); if (!non_zc) iov_iter_revert(to, count - iov_iter_count(to)); return 0; } *err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &received, &dataptr); if (*err) { if (!non_zc) iov_iter_revert(to, count - iov_iter_count(to)); trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); return 0; } if (rsize < received) { pr_err("bogus RREAD count (%u > %u)\n", received, rsize); *err = -EIO; p9_req_put(clnt, req); return 0; } p9_debug(P9_DEBUG_9P, "<<< RREAD count %u\n", received); if (non_zc) { int n = copy_to_iter(dataptr, received, to); if (n != received) { *err = -EFAULT; p9_req_put(clnt, req); return n; } } else { iov_iter_revert(to, count - received - iov_iter_count(to)); } p9_req_put(clnt, req); return received; } EXPORT_SYMBOL(p9_client_read_once); int p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) { struct p9_client *clnt = fid->clnt; struct p9_req_t *req; int total = 0; *err = 0; while (iov_iter_count(from)) { size_t count = iov_iter_count(from); u32 rsize = fid->iounit; u32 written; if (!rsize || rsize > clnt->msize - P9_IOHDRSZ) rsize = clnt->msize - P9_IOHDRSZ; if (count < rsize) rsize = count; p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %u (/%zu)\n", fid->fid, offset, rsize, count); /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && rsize > 1024) { req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0, rsize, P9_ZC_HDR_SZ, "dqd", fid->fid, offset, rsize); } else { req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid, offset, rsize, from); } if (IS_ERR(req)) { iov_iter_revert(from, count - iov_iter_count(from)); *err = PTR_ERR(req); break; } *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written); if (*err) { iov_iter_revert(from, count - iov_iter_count(from)); trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); break; } if (rsize < written) { pr_err("bogus RWRITE count (%u > %u)\n", written, rsize); *err = -EIO; iov_iter_revert(from, count - iov_iter_count(from)); p9_req_put(clnt, req); break; } p9_debug(P9_DEBUG_9P, "<<< RWRITE count %u\n", written); p9_req_put(clnt, req); iov_iter_revert(from, count - written - iov_iter_count(from)); total += written; offset += written; } return total; } EXPORT_SYMBOL(p9_client_write); void p9_client_write_subreq(struct netfs_io_subrequest *subreq) { struct netfs_io_request *wreq = subreq->rreq; struct p9_fid *fid = wreq->netfs_priv; struct p9_client *clnt = fid->clnt; struct p9_req_t *req; unsigned long long start = subreq->start + subreq->transferred; int written, len = subreq->len - subreq->transferred; int err; p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu len %d\n", fid->fid, start, len); /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && len > 1024) { req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, &subreq->io_iter, 0, wreq->len, P9_ZC_HDR_SZ, "dqd", fid->fid, start, len); } else { req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid, start, len, &subreq->io_iter); } if (IS_ERR(req)) { netfs_write_subrequest_terminated(subreq, PTR_ERR(req)); return; } err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); netfs_write_subrequest_terminated(subreq, err); return; } if (written > len) { pr_err("bogus RWRITE count (%d > %u)\n", written, len); written = -EIO; } p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", len); p9_req_put(clnt, req); netfs_write_subrequest_terminated(subreq, written); } EXPORT_SYMBOL(p9_client_write_subreq); struct p9_wstat *p9_client_stat(struct p9_fid *fid) { int err; struct p9_client *clnt; struct p9_wstat *ret; struct p9_req_t *req; u16 ignored; p9_debug(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid); ret = kmalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return ERR_PTR(-ENOMEM); clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TSTAT, "d", fid->fid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "wS", &ignored, ret); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n" "<<< mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n" "<<< uid=%d gid=%d n_muid=%d\n", ret->size, ret->type, ret->dev, ret->qid.type, ret->qid.path, ret->qid.version, ret->mode, ret->atime, ret->mtime, ret->length, ret->name, ret->uid, ret->gid, ret->muid, ret->extension, from_kuid(&init_user_ns, ret->n_uid), from_kgid(&init_user_ns, ret->n_gid), from_kuid(&init_user_ns, ret->n_muid)); p9_req_put(clnt, req); return ret; error: kfree(ret); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_stat); struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, u64 request_mask) { int err; struct p9_client *clnt; struct p9_stat_dotl *ret; struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n", fid->fid, request_mask); ret = kmalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return ERR_PTR(-ENOMEM); clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "A", ret); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RGETATTR st_result_mask=%lld\n" "<<< qid=%x.%llx.%x\n" "<<< st_mode=%8.8x st_nlink=%llu\n" "<<< st_uid=%d st_gid=%d\n" "<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n" "<<< st_atime_sec=%lld st_atime_nsec=%lld\n" "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n" "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n" "<<< st_btime_sec=%lld st_btime_nsec=%lld\n" "<<< st_gen=%lld st_data_version=%lld\n", ret->st_result_mask, ret->qid.type, ret->qid.path, ret->qid.version, ret->st_mode, ret->st_nlink, from_kuid(&init_user_ns, ret->st_uid), from_kgid(&init_user_ns, ret->st_gid), ret->st_rdev, ret->st_size, ret->st_blksize, ret->st_blocks, ret->st_atime_sec, ret->st_atime_nsec, ret->st_mtime_sec, ret->st_mtime_nsec, ret->st_ctime_sec, ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec, ret->st_gen, ret->st_data_version); p9_req_put(clnt, req); return ret; error: kfree(ret); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_getattr_dotl); static int p9_client_statsize(struct p9_wstat *wst, int proto_version) { int ret; /* NOTE: size shouldn't include its own length */ /* size[2] type[2] dev[4] qid[13] */ /* mode[4] atime[4] mtime[4] length[8]*/ /* name[s] uid[s] gid[s] muid[s] */ ret = 2 + 4 + 13 + 4 + 4 + 4 + 8 + 2 + 2 + 2 + 2; if (wst->name) ret += strlen(wst->name); if (wst->uid) ret += strlen(wst->uid); if (wst->gid) ret += strlen(wst->gid); if (wst->muid) ret += strlen(wst->muid); if (proto_version == p9_proto_2000u || proto_version == p9_proto_2000L) { /* extension[s] n_uid[4] n_gid[4] n_muid[4] */ ret += 2 + 4 + 4 + 4; if (wst->extension) ret += strlen(wst->extension); } return ret; } int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) { int err = 0; struct p9_req_t *req; struct p9_client *clnt; clnt = fid->clnt; wst->size = p9_client_statsize(wst, clnt->proto_version); p9_debug(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid); p9_debug(P9_DEBUG_9P, " sz=%x type=%x dev=%x qid=%x.%llx.%x\n" " mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" " name=%s uid=%s gid=%s muid=%s extension=(%s)\n" " uid=%d gid=%d n_muid=%d\n", wst->size, wst->type, wst->dev, wst->qid.type, wst->qid.path, wst->qid.version, wst->mode, wst->atime, wst->mtime, wst->length, wst->name, wst->uid, wst->gid, wst->muid, wst->extension, from_kuid(&init_user_ns, wst->n_uid), from_kgid(&init_user_ns, wst->n_gid), from_kuid(&init_user_ns, wst->n_muid)); req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", fid->fid, wst->size + 2, wst); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid); p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_wstat); int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) { int err = 0; struct p9_req_t *req; struct p9_client *clnt; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid); p9_debug(P9_DEBUG_9P, " valid=%x mode=%x uid=%d gid=%d size=%lld\n", p9attr->valid, p9attr->mode, from_kuid(&init_user_ns, p9attr->uid), from_kgid(&init_user_ns, p9attr->gid), p9attr->size); p9_debug(P9_DEBUG_9P, " atime_sec=%lld atime_nsec=%lld\n", p9attr->atime_sec, p9attr->atime_nsec); p9_debug(P9_DEBUG_9P, " mtime_sec=%lld mtime_nsec=%lld\n", p9attr->mtime_sec, p9attr->mtime_nsec); req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid); p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_setattr); int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) { int err; struct p9_req_t *req; struct p9_client *clnt; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid); req = p9_client_rpc(clnt, P9_TSTATFS, "d", fid->fid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RSTATFS fid %d type 0x%x bsize %u blocks %llu bfree %llu bavail %llu files %llu ffree %llu fsid %llu namelen %u\n", fid->fid, sb->type, sb->bsize, sb->blocks, sb->bfree, sb->bavail, sb->files, sb->ffree, sb->fsid, sb->namelen); p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_statfs); int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid, const char *name) { int err = 0; struct p9_req_t *req; struct p9_client *clnt; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n", fid->fid, newdirfid->fid, name); req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid, newdirfid->fid, name); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid); p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_rename); int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, struct p9_fid *newdirfid, const char *new_name) { int err = 0; struct p9_req_t *req; struct p9_client *clnt; clnt = olddirfid->clnt; p9_debug(P9_DEBUG_9P, ">>> TRENAMEAT olddirfid %d old name %s newdirfid %d new name %s\n", olddirfid->fid, old_name, newdirfid->fid, new_name); req = p9_client_rpc(clnt, P9_TRENAMEAT, "dsds", olddirfid->fid, old_name, newdirfid->fid, new_name); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n", newdirfid->fid, new_name); p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_renameat); /* An xattrwalk without @attr_name gives the fid for the lisxattr namespace */ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, const char *attr_name, u64 *attr_size) { int err; struct p9_req_t *req; struct p9_client *clnt; struct p9_fid *attr_fid; clnt = file_fid->clnt; attr_fid = p9_fid_create(clnt); if (!attr_fid) { err = -ENOMEM; goto error; } p9_debug(P9_DEBUG_9P, ">>> TXATTRWALK file_fid %d, attr_fid %d name '%s'\n", file_fid->fid, attr_fid->fid, attr_name); req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds", file_fid->fid, attr_fid->fid, attr_name); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "q", attr_size); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); goto clunk_fid; } p9_req_put(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n", attr_fid->fid, *attr_size); return attr_fid; clunk_fid: p9_fid_put(attr_fid); attr_fid = NULL; error: if (attr_fid && attr_fid != file_fid) p9_fid_destroy(attr_fid); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(p9_client_xattrwalk); int p9_client_xattrcreate(struct p9_fid *fid, const char *name, u64 attr_size, int flags) { int err = 0; struct p9_req_t *req; struct p9_client *clnt; p9_debug(P9_DEBUG_9P, ">>> TXATTRCREATE fid %d name %s size %llu flag %d\n", fid->fid, name, attr_size, flags); clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd", fid->fid, name, attr_size, flags); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid); p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL_GPL(p9_client_xattrcreate); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) { int err, non_zc = 0; u32 rsize; struct p9_client *clnt; struct p9_req_t *req; char *dataptr; struct kvec kv = {.iov_base = data, .iov_len = count}; struct iov_iter to; iov_iter_kvec(&to, ITER_DEST, &kv, 1, count); p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %u\n", fid->fid, offset, count); clnt = fid->clnt; rsize = fid->iounit; if (!rsize || rsize > clnt->msize - P9_READDIRHDRSZ) rsize = clnt->msize - P9_READDIRHDRSZ; if (count < rsize) rsize = count; /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && rsize > 1024) { /* response header len is 11 * PDU Header(7) + IO Size (4) */ req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0, 11, "dqd", fid->fid, offset, rsize); } else { non_zc = 1; req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid, offset, rsize); } if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } if (rsize < count) { pr_err("bogus RREADDIR count (%u > %u)\n", count, rsize); err = -EIO; goto free_and_error; } p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %u\n", count); if (non_zc) memmove(data, dataptr, count); p9_req_put(clnt, req); return count; free_and_error: p9_req_put(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_readdir); int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode, dev_t rdev, kgid_t gid, struct p9_qid *qid) { int err; struct p9_client *clnt; struct p9_req_t *req; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d minor %d\n", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev)); req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddg", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev), gid); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type, qid->path, qid->version); error: p9_req_put(clnt, req); return err; } EXPORT_SYMBOL(p9_client_mknod_dotl); int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, kgid_t gid, struct p9_qid *qid) { int err; struct p9_client *clnt; struct p9_req_t *req; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n", fid->fid, name, mode, from_kgid(&init_user_ns, gid)); req = p9_client_rpc(clnt, P9_TMKDIR, "dsdg", fid->fid, name, mode, gid); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, qid->path, qid->version); error: p9_req_put(clnt, req); return err; } EXPORT_SYMBOL(p9_client_mkdir_dotl); int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) { int err; struct p9_client *clnt; struct p9_req_t *req; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d start %lld length %lld proc_id %d client_id %s\n", fid->fid, flock->type, flock->flags, flock->start, flock->length, flock->proc_id, flock->client_id); req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type, flock->flags, flock->start, flock->length, flock->proc_id, flock->client_id); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "b", status); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); error: p9_req_put(clnt, req); return err; } EXPORT_SYMBOL(p9_client_lock_dotl); int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) { int err; struct p9_client *clnt; struct p9_req_t *req; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld length %lld proc_id %d client_id %s\n", fid->fid, glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid, glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "bqqds", &glock->type, &glock->start, &glock->length, &glock->proc_id, &glock->client_id); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld proc_id %d client_id %s\n", glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); error: p9_req_put(clnt, req); return err; } EXPORT_SYMBOL(p9_client_getlock_dotl); int p9_client_readlink(struct p9_fid *fid, char **target) { int err; struct p9_client *clnt; struct p9_req_t *req; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid); req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "s", target); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); error: p9_req_put(clnt, req); return err; } EXPORT_SYMBOL(p9_client_readlink); int __init p9_client_init(void) { p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU); return p9_req_cache ? 0 : -ENOMEM; } void __exit p9_client_exit(void) { kmem_cache_destroy(p9_req_cache); }
2616 163 2490 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2017 Intel Corporation. All rights reserved. * * This code is based in part on work published here: * * https://github.com/IAIK/KAISER * * The original work was written by and signed off by for the Linux * kernel by: * * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> * * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and * Andy Lutomirsky <luto@amacapital.net> */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/cpu.h> #include <asm/cpufeature.h> #include <asm/hypervisor.h> #include <asm/vsyscall.h> #include <asm/cmdline.h> #include <asm/pti.h> #include <asm/tlbflush.h> #include <asm/desc.h> #include <asm/sections.h> #include <asm/set_memory.h> #include <asm/bugs.h> #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt /* Backporting helper */ #ifndef __GFP_NOTRACK #define __GFP_NOTRACK 0 #endif /* * Define the page-table levels we clone for user-space on 32 * and 64 bit. */ #ifdef CONFIG_X86_64 #define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD #else #define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE #endif static void __init pti_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } static void __init pti_print_if_secure(const char *reason) { if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } /* Assume mode is auto unless overridden via cmdline below. */ static enum pti_mode { PTI_AUTO = 0, PTI_FORCE_OFF, PTI_FORCE_ON } pti_mode; void __init pti_check_boottime_disable(void) { if (hypervisor_is_type(X86_HYPER_XEN_PV)) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on XEN PV."); return; } if (pti_mode == PTI_AUTO && !cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL)) pti_mode = PTI_FORCE_OFF; if (pti_mode == PTI_FORCE_OFF) { pti_print_if_insecure("disabled on command line."); return; } if (pti_mode == PTI_FORCE_ON) pti_print_if_secure("force enabled on command line."); if (pti_mode == PTI_AUTO && !boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) return; setup_force_cpu_cap(X86_FEATURE_PTI); if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { pr_debug("PTI enabled, disabling INVLPGB\n"); setup_clear_cpu_cap(X86_FEATURE_INVLPGB); } } static int __init pti_parse_cmdline(char *arg) { if (!strcmp(arg, "off")) pti_mode = PTI_FORCE_OFF; else if (!strcmp(arg, "on")) pti_mode = PTI_FORCE_ON; else if (!strcmp(arg, "auto")) pti_mode = PTI_AUTO; else return -EINVAL; return 0; } early_param("pti", pti_parse_cmdline); static int __init pti_parse_cmdline_nopti(char *arg) { pti_mode = PTI_FORCE_OFF; return 0; } early_param("nopti", pti_parse_cmdline_nopti); pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { /* * Changes to the high (kernel) portion of the kernelmode page * tables are not automatically propagated to the usermode tables. * * Users should keep in mind that, unlike the kernelmode tables, * there is no vmalloc_fault equivalent for the usermode tables. * Top-level entries added to init_mm's usermode pgd after boot * will not be automatically propagated to other mms. */ if (!pgdp_maps_userspace(pgdp) || (pgd.pgd & _PAGE_NOPTISHADOW)) return pgd; /* * The user page tables get the full PGD, accessible from * userspace: */ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; /* * If this is normal user memory, make it NX in the kernel * pagetables so that, if we somehow screw up and return to * usermode with the kernel CR3 loaded, we'll get a page fault * instead of allowing user code to execute with the wrong CR3. * * As exceptions, we don't set NX if: * - _PAGE_USER is not set. This could be an executable * EFI runtime mapping or something similar, and the kernel * may execute from it * - we don't have NX support * - we're clearing the PGD (i.e. the new pgd is not present). */ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && (__supported_pte_mask & _PAGE_NX)) pgd.pgd |= _PAGE_NX; /* return the copy of the PGD we want the kernel to use: */ return pgd; } /* * Walk the user copy of the page tables (optionally) trying to allocate * page table pages on the way down. * * Returns a pointer to a P4D on success, or NULL on failure. */ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) { pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); if (address < PAGE_OFFSET) { WARN_ONCE(1, "attempt to walk user address\n"); return NULL; } if (pgd_none(*pgd)) { unsigned long new_p4d_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_p4d_page)) return NULL; set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } BUILD_BUG_ON(pgd_leaf(*pgd)); return p4d_offset(pgd, address); } /* * Walk the user copy of the page tables (optionally) trying to allocate * page table pages on the way down. * * Returns a pointer to a PMD on success, or NULL on failure. */ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); p4d_t *p4d; pud_t *pud; p4d = pti_user_pagetable_walk_p4d(address); if (!p4d) return NULL; BUILD_BUG_ON(p4d_leaf(*p4d)); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pud_page)) return NULL; set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); } pud = pud_offset(p4d, address); /* The user page tables do not use large mappings: */ if (pud_leaf(*pud)) { WARN_ON(1); return NULL; } if (pud_none(*pud)) { unsigned long new_pmd_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pmd_page)) return NULL; set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); } return pmd_offset(pud, address); } /* * Walk the shadow copy of the page tables (optionally) trying to allocate * page table pages on the way down. Does not support large pages. * * Note: this is only used when mapping *new* kernel data into the * user/shadow page tables. It is never used for userspace data. * * Returns a pointer to a PTE on success, or NULL on failure. */ static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); pmd_t *pmd; pte_t *pte; pmd = pti_user_pagetable_walk_pmd(address); if (!pmd) return NULL; /* Large PMD mapping found */ if (pmd_leaf(*pmd)) { /* Clear the PMD if we hit a large mapping from the first round */ if (late_text) { set_pmd(pmd, __pmd(0)); } else { WARN_ON_ONCE(1); return NULL; } } if (pmd_none(*pmd)) { unsigned long new_pte_page = __get_free_page(gfp); if (!new_pte_page) return NULL; set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); } pte = pte_offset_kernel(pmd, address); if (pte_flags(*pte) & _PAGE_USER) { WARN_ONCE(1, "attempt to walk to user pte\n"); return NULL; } return pte; } #ifdef CONFIG_X86_VSYSCALL_EMULATION static void __init pti_setup_vsyscall(void) { pte_t *pte, *target_pte; unsigned int level; pte = lookup_address(VSYSCALL_ADDR, &level); if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) return; target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false); if (WARN_ON(!target_pte)) return; *target_pte = *pte; set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); } #else static void __init pti_setup_vsyscall(void) { } #endif enum pti_clone_level { PTI_CLONE_PMD, PTI_CLONE_PTE, }; static void pti_clone_pgtable(unsigned long start, unsigned long end, enum pti_clone_level level, bool late_text) { unsigned long addr; /* * Clone the populated PMDs which cover start to end. These PMD areas * can have holes. */ for (addr = start; addr < end;) { pte_t *pte, *target_pte; pmd_t *pmd, *target_pmd; pgd_t *pgd; p4d_t *p4d; pud_t *pud; /* Overflow check */ if (addr < start) break; pgd = pgd_offset_k(addr); if (WARN_ON(pgd_none(*pgd))) return; p4d = p4d_offset(pgd, addr); if (WARN_ON(p4d_none(*p4d))) return; pud = pud_offset(p4d, addr); if (pud_none(*pud)) { WARN_ON_ONCE(addr & ~PUD_MASK); addr = round_up(addr + 1, PUD_SIZE); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { WARN_ON_ONCE(addr & ~PMD_MASK); addr = round_up(addr + 1, PMD_SIZE); continue; } if (pmd_leaf(*pmd) || level == PTI_CLONE_PMD) { target_pmd = pti_user_pagetable_walk_pmd(addr); if (WARN_ON(!target_pmd)) return; /* * Only clone present PMDs. This ensures only setting * _PAGE_GLOBAL on present PMDs. This should only be * called on well-known addresses anyway, so a non- * present PMD would be a surprise. */ if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) return; /* * Setting 'target_pmd' below creates a mapping in both * the user and kernel page tables. It is effectively * global, so set it as global in both copies. Note: * the X86_FEATURE_PGE check is not _required_ because * the CPU ignores _PAGE_GLOBAL when PGE is not * supported. The check keeps consistency with * code that only set this bit when supported. */ if (boot_cpu_has(X86_FEATURE_PGE)) *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); /* * Copy the PMD. That is, the kernelmode and usermode * tables will share the last-level page tables of this * address range */ *target_pmd = *pmd; addr = round_up(addr + 1, PMD_SIZE); } else if (level == PTI_CLONE_PTE) { /* Walk the page-table down to the pte level */ pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { addr = round_up(addr + 1, PAGE_SIZE); continue; } /* Only clone present PTEs */ if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT))) return; /* Allocate PTE in the user page-table */ target_pte = pti_user_pagetable_walk_pte(addr, late_text); if (WARN_ON(!target_pte)) return; /* Set GLOBAL bit in both PTEs */ if (boot_cpu_has(X86_FEATURE_PGE)) *pte = pte_set_flags(*pte, _PAGE_GLOBAL); /* Clone the PTE */ *target_pte = *pte; addr = round_up(addr + 1, PAGE_SIZE); } else { BUG(); } } } #ifdef CONFIG_X86_64 /* * Clone a single p4d (i.e. a top-level entry on 4-level systems and a * next-level entry on 5-level systems. */ static void __init pti_clone_p4d(unsigned long addr) { p4d_t *kernel_p4d, *user_p4d; pgd_t *kernel_pgd; user_p4d = pti_user_pagetable_walk_p4d(addr); if (!user_p4d) return; kernel_pgd = pgd_offset_k(addr); kernel_p4d = p4d_offset(kernel_pgd, addr); *user_p4d = *kernel_p4d; } /* * Clone the CPU_ENTRY_AREA and associated data into the user space visible * page table. */ static void __init pti_clone_user_shared(void) { unsigned int cpu; pti_clone_p4d(CPU_ENTRY_AREA_BASE); for_each_possible_cpu(cpu) { /* * The SYSCALL64 entry code needs one word of scratch space * in which to spill a register. It lives in the sp2 slot * of the CPU's TSS. * * This is done for all possible CPUs during boot to ensure * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); pte_t *target_pte; target_pte = pti_user_pagetable_walk_pte(va, false); if (WARN_ON(!target_pte)) return; *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL); } } #else /* CONFIG_X86_64 */ /* * On 32 bit PAE systems with 1GB of Kernel address space there is only * one pgd/p4d for the whole kernel. Cloning that would map the whole * address space into the user page-tables, making PTI useless. So clone * the page-table on the PMD level to prevent that. */ static void __init pti_clone_user_shared(void) { unsigned long start, end; start = CPU_ENTRY_AREA_BASE; end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); pti_clone_pgtable(start, end, PTI_CLONE_PMD, false); } #endif /* CONFIG_X86_64 */ /* * Clone the ESPFIX P4D into the user space visible page table */ static void __init pti_setup_espfix64(void) { #ifdef CONFIG_X86_ESPFIX64 pti_clone_p4d(ESPFIX_BASE_ADDR); #endif } /* * Clone the populated PMDs of the entry text and force it RO. */ static void pti_clone_entry_text(bool late) { pti_clone_pgtable((unsigned long) __entry_text_start, (unsigned long) __entry_text_end, PTI_LEVEL_KERNEL_IMAGE, late); } /* * Global pages and PCIDs are both ways to make kernel TLB entries * live longer, reduce TLB misses and improve kernel performance. * But, leaving all kernel text Global makes it potentially accessible * to Meltdown-style attacks which make it trivial to find gadgets or * defeat KASLR. * * Only use global pages when it is really worth it. */ static inline bool pti_kernel_image_global_ok(void) { /* * Systems with PCIDs get little benefit from global * kernel text and are not worth the downsides. */ if (cpu_feature_enabled(X86_FEATURE_PCID)) return false; /* * Only do global kernel image for pti=auto. Do the most * secure thing (not global) if pti=on specified. */ if (pti_mode != PTI_AUTO) return false; /* * K8 may not tolerate the cleared _PAGE_RW on the userspace * global kernel image pages. Do the safe thing (disable * global kernel image). This is unlikely to ever be * noticed because PTI is disabled by default on AMD CPUs. */ if (boot_cpu_has(X86_FEATURE_K8)) return false; /* * RANDSTRUCT derives its hardening benefits from the * attacker's lack of knowledge about the layout of kernel * data structures. Keep the kernel image non-global in * cases where RANDSTRUCT is in use to help keep the layout a * secret. */ if (IS_ENABLED(CONFIG_RANDSTRUCT)) return false; return true; } /* * For some configurations, map all of kernel text into the user page * tables. This reduces TLB misses, especially on non-PCID systems. */ static void pti_clone_kernel_text(void) { /* * rodata is part of the kernel image and is normally * readable on the filesystem or on the web. But, do not * clone the areas past rodata, they might contain secrets. */ unsigned long start = PFN_ALIGN(_text); unsigned long end_clone = (unsigned long)__end_rodata_aligned; unsigned long end_global = PFN_ALIGN((unsigned long)_etext); if (!pti_kernel_image_global_ok()) return; pr_debug("mapping partial kernel image into user address space\n"); /* * Note that this will undo _some_ of the work that * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false); /* * pti_clone_pgtable() will set the global bit in any PMDs * that it clones, but we also need to get any PTEs in * the last level for areas that are not huge-page-aligned. */ /* Set the global bit for normal non-__init kernel text: */ set_memory_global(start, (end_global - start) >> PAGE_SHIFT); } static void pti_set_kernel_image_nonglobal(void) { /* * The identity map is created with PMDs, regardless of the * actual length of the kernel. We need to clear * _PAGE_GLOBAL up to a PMD boundary, not just to the end * of the image. */ unsigned long start = PFN_ALIGN(_text); unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE); /* * This clears _PAGE_GLOBAL from the entire kernel image. * pti_clone_kernel_text() map put _PAGE_GLOBAL back for * areas that are mapped to userspace. */ set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); } /* * Initialize kernel page table isolation */ void __init pti_init(void) { if (!boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("enabled\n"); #ifdef CONFIG_X86_32 /* * We check for X86_FEATURE_PCID here. But the init-code will * clear the feature flag on 32 bit because the feature is not * supported on 32 bit anyway. To print the warning we need to * check with cpuid directly again. */ if (cpuid_ecx(0x1) & BIT(17)) { /* Use printk to work around pr_fmt() */ printk(KERN_WARNING "\n"); printk(KERN_WARNING "************************************************************\n"); printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); printk(KERN_WARNING "** **\n"); printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); printk(KERN_WARNING "** **\n"); printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); printk(KERN_WARNING "************************************************************\n"); } #endif pti_clone_user_shared(); /* Undo all global bits from the init pagetables in head_64.S: */ pti_set_kernel_image_nonglobal(); /* Replace some of the global bits just for shared entry text: */ /* * This is very early in boot. Device and Late initcalls can do * modprobe before free_initmem() and mark_readonly(). This * pti_clone_entry_text() allows those user-mode-helpers to function, * but notably the text is still RW. */ pti_clone_entry_text(false); pti_setup_espfix64(); pti_setup_vsyscall(); } /* * Finalize the kernel mappings in the userspace page-table. Some of the * mappings for the kernel image might have changed since pti_init() * cloned them. This is because parts of the kernel image have been * mapped RO and/or NX. These changes need to be cloned again to the * userspace page-table. */ void pti_finalize(void) { if (!boot_cpu_has(X86_FEATURE_PTI)) return; /* * This is after free_initmem() (all initcalls are done) and we've done * mark_readonly(). Text is now NX which might've split some PMDs * relative to the early clone. */ pti_clone_entry_text(true); pti_clone_kernel_text(); debug_checkwx_user(); }
11 10 12 9 12 12 719 54 742 742 742 423 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 /* SPDX-License-Identifier: GPL-2.0 * * page_pool/helpers.h * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ /** * DOC: page_pool allocator * * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * * If the driver knows that it always requires full pages or its allocations are * always smaller than half a page, it can use one of the more specific API * calls: * * 1. page_pool_alloc_pages(): allocate memory without page splitting when * driver knows that the memory it need is always bigger than half of the page * allocated from page pool. There is no cache line dirtying for 'struct page' * when a page is recycled back to the page pool. * * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver * knows that the memory it need is always smaller than or equal to half of the * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * * page_pool_put_page() may be called multiple times on the same page if a page * is split into multiple fragments. For the last fragment, it will either * recycle the page, or in case of page->_refcount > 1, it will release the DMA * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H #include <linux/dma-mapping.h> #include <net/page_pool/types.h> #include <net/net_debug.h> #include <net/netmem.h> #ifdef CONFIG_PAGE_POOL_STATS /* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats); bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) { return 0; } static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) { return data; } static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { return data; } #endif /** * page_pool_dev_alloc_pages() - allocate a page. * @pool: pool from which to allocate * * Get a page from the page allocator or page_pool caches. */ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_pages(pool, gfp); } /** * page_pool_dev_alloc_frag() - allocate a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: requested size * * Get a page fragment from the page allocator or page_pool caches. * * Return: allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_frag(pool, offset, size, gfp); } static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; netmem_ref netmem; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; return page_pool_alloc_netmems(pool, gfp); } netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); if (unlikely(!netmem)) return 0; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize * underestimate problem. */ if (pool->frag_offset + *size > max_size) { *size = max_size - *offset; pool->frag_offset = max_size; } return netmem; } static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; return page_pool_alloc_netmem(pool, offset, size, gfp); } static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool) { gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; return page_pool_alloc_netmems(pool, gfp); } static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); } /** * page_pool_dev_alloc() - allocate a page or a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: in as the requested size, out as the allocated size * * Get a page or a page fragment from the page allocator or page_pool caches * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * * Return: allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc(pool, offset, size, gfp); } static inline void *page_pool_alloc_va(struct page_pool *pool, unsigned int *size, gfp_t gfp) { unsigned int offset; struct page *page; /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); if (unlikely(!page)) return NULL; return page_address(page) + offset; } /** * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its * va. * @pool: pool from which to allocate * @size: in as the requested size, out as the allocated size * * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * * Return: the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_va(pool, size, gfp); } /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated * * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ static inline enum dma_data_direction page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr) { atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr); } /** * page_pool_fragment_page() - split a fresh page into fragments * @page: page to split * @nr: references to set * * pp_ref_count represents the number of outstanding references to the page, * which will be freed using page_pool APIs (rather than page allocator APIs * like put_page()). Such references are usually held by page_pool-aware * objects like skbs marked for page pool recycling. * * This helper allows the caller to take (set) multiple references to a * freshly allocated page. The page must be freshly allocated (have a * pp_ref_count of 1). This is commonly done by drivers and * "fragment allocators" to save atomic operations - either when they know * upfront how many references they will need; or to take MAX references and * return the unused ones with a single atomic dec(), instead of performing * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { page_pool_fragment_netmem(page_to_netmem(page), nr); } static inline long page_pool_unref_netmem(netmem_ref netmem, long nr) { atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); long ret; /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ if (atomic_long_read(pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) atomic_long_set(pp_ref_count, 1); return 0; } ret = atomic_long_sub_return(nr, pp_ref_count); WARN_ON(ret < 0); /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call * page_pool_unref_page() currently. */ if (unlikely(!ret)) atomic_long_set(pp_ref_count, 1); return ret; } static inline long page_pool_unref_page(struct page *page, long nr) { return page_pool_unref_netmem(page_to_netmem(page), nr); } static inline void page_pool_ref_netmem(netmem_ref netmem) { atomic_long_inc(netmem_get_pp_ref_count_ref(netmem)); } static inline void page_pool_ref_page(struct page *page) { page_pool_ref_netmem(page_to_netmem(page)); } static inline bool page_pool_unref_and_test(netmem_ref netmem) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_netmem(netmem, 1) == 0; } static inline void page_pool_put_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL if (!page_pool_unref_and_test(netmem)) return; page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); #endif } /** * page_pool_put_page() - release a reference to a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @dma_sync_size: how much of the page may have been touched by the device * @allow_direct: released by the consumer, allow lockless caching * * The outcome of this depends on the page refcnt. If the driver bumps * the refcnt > 1 this will unmap the page. If the page refcnt is 1 * the allocator owns the page and will try to recycle it in one of the pool * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device * using dma_sync_single_range_for_device(). */ static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), dma_sync_size, allow_direct); } static inline void page_pool_put_full_netmem(struct page_pool *pool, netmem_ref netmem, bool allow_direct) { page_pool_put_netmem(pool, netmem, -1, allow_direct); } /** * page_pool_put_full_page() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @allow_direct: released by the consumer, allow lockless caching * * Similar to page_pool_put_page(), but will DMA sync the entire memory area * as configured in &page_pool_params.max_len. */ static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), -1, allow_direct); } /** * page_pool_recycle_direct() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * * Similar to page_pool_put_full_page() but caller must guarantee safe context * (e.g NAPI), since it will recycle the page directly into the pool fast cache. */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { page_pool_put_full_page(pool, page, true); } static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, netmem_ref netmem) { page_pool_put_full_netmem(pool, netmem, true); } #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) /** * page_pool_free_va() - free a va into the page_pool * @pool: pool from which va was allocated * @va: va to be freed * @allow_direct: freed by the consumer, allow lockless caching * * Free a va allocated from page_pool_allo_va(). */ static inline void page_pool_free_va(struct page_pool *pool, void *va, bool allow_direct) { page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); } static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) { dma_addr_t ret = netmem_get_dma_addr(netmem); if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) ret <<= PAGE_SHIFT; return ret; } /** * page_pool_get_dma_addr() - Retrieve the stored DMA address. * @page: page allocated from a page pool * * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { return page_pool_get_dma_addr_netmem(page_to_netmem(page)); } static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, const dma_addr_t dma_addr, u32 offset, u32 dma_sync_size) { dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, offset + pool->p.offset, dma_sync_size, page_pool_get_dma_dir(pool)); } /** * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW * @pool: &page_pool the @page belongs to * @page: page to sync * @offset: offset from page start to "hard" start if using PP frags * @dma_sync_size: size of the data written to the page * * Can be used as a shorthand to sync Rx pages before accessing them in the * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``. * Note that this version performs DMA sync unconditionally, even if the * associated PP doesn't perform sync-for-device. */ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, dma_sync_size); } static inline void page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, const netmem_ref netmem, u32 offset, u32 dma_sync_size) { if (!pool->dma_sync_for_cpu) return; __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr_netmem(netmem), offset, dma_sync_size); } static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); } static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) { if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } static inline bool page_pool_is_unreadable(struct page_pool *pool) { return !!pool->mp_ops; } #endif /* _NET_PAGE_POOL_HELPERS_H */
2 39 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* * net/tipc/bcast.h: Include file for TIPC broadcast code * * Copyright (c) 2003-2006, 2014-2015, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_BCAST_H #define _TIPC_BCAST_H #include "core.h" struct tipc_node; struct tipc_msg; struct tipc_nl_msg; struct tipc_nlist; struct tipc_nitem; extern const char tipc_bclink_name[]; extern unsigned long sysctl_tipc_bc_retruni; #define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000) #define BCLINK_MODE_BCAST 0x1 #define BCLINK_MODE_RCAST 0x2 #define BCLINK_MODE_SEL 0x4 struct tipc_nlist { struct list_head list; u32 self; u16 remote; bool local; }; void tipc_nlist_init(struct tipc_nlist *nl, u32 self); void tipc_nlist_purge(struct tipc_nlist *nl); void tipc_nlist_add(struct tipc_nlist *nl, u32 node); void tipc_nlist_del(struct tipc_nlist *nl, u32 node); /* Cookie to be used between socket and broadcast layer * @rcast: replicast (instead of broadcast) was used at previous xmit * @mandatory: broadcast/replicast indication was set by user * @deferredq: defer queue to make message in order * @expires: re-evaluate non-mandatory transmit method if we are past this */ struct tipc_mc_method { bool rcast; bool mandatory; struct sk_buff_head deferredq; unsigned long expires; }; int tipc_bcast_init(struct net *net); void tipc_bcast_stop(struct net *net); void tipc_bcast_add_peer(struct net *net, struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); int tipc_bcast_get_mtu(struct net *net); void tipc_bcast_toggle_rcast(struct net *net, bool supp); int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt); int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, u16 *cong_link_cnt); int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr); int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *retrq); int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *bcl); int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l); u32 tipc_bcast_get_mode(struct net *net); u32 tipc_bcast_get_broadcast_ratio(struct net *net); void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, struct sk_buff_head *inputq); static inline void tipc_bcast_lock(struct net *net) { spin_lock_bh(&tipc_net(net)->bclock); } static inline void tipc_bcast_unlock(struct net *net) { spin_unlock_bh(&tipc_net(net)->bclock); } static inline struct tipc_link *tipc_bc_sndlink(struct net *net) { return tipc_net(net)->bcl; } #endif
5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Stream Parser * * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #ifndef __NET_STRPARSER_H_ #define __NET_STRPARSER_H_ #include <linux/skbuff.h> #include <net/sock.h> #define STRP_STATS_ADD(stat, count) ((stat) += (count)) #define STRP_STATS_INCR(stat) ((stat)++) struct strp_stats { unsigned long long msgs; unsigned long long bytes; unsigned int mem_fail; unsigned int need_more_hdr; unsigned int msg_too_big; unsigned int msg_timeouts; unsigned int bad_hdr_len; }; struct strp_aggr_stats { unsigned long long msgs; unsigned long long bytes; unsigned int mem_fail; unsigned int need_more_hdr; unsigned int msg_too_big; unsigned int msg_timeouts; unsigned int bad_hdr_len; unsigned int aborts; unsigned int interrupted; unsigned int unrecov_intr; }; struct strparser; /* Callbacks are called with lock held for the attached socket */ struct strp_callbacks { int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); int (*read_sock)(struct strparser *strp, read_descriptor_t *desc, sk_read_actor_t recv_actor); int (*read_sock_done)(struct strparser *strp, int err); void (*abort_parser)(struct strparser *strp, int err); void (*lock)(struct strparser *strp); void (*unlock)(struct strparser *strp); }; struct strp_msg { int full_len; int offset; }; struct _strp_msg { /* Internal cb structure. struct strp_msg must be first for passing * to upper layer. */ struct strp_msg strp; int accum_len; }; struct sk_skb_cb { #define SK_SKB_CB_PRIV_LEN 20 unsigned char data[SK_SKB_CB_PRIV_LEN]; /* align strp on cache line boundary within skb->cb[] */ unsigned char pad[4]; struct _strp_msg strp; /* strp users' data follows */ struct tls_msg { u8 control; } tls; /* temp_reg is a temporary register used for bpf_convert_data_end_access * when dst_reg == src_reg. */ u64 temp_reg; }; static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ struct strparser { struct sock *sk; u32 stopped : 1; u32 paused : 1; u32 aborted : 1; u32 interrupted : 1; u32 unrecov_intr : 1; struct sk_buff **skb_nextp; struct sk_buff *skb_head; unsigned int need_bytes; struct delayed_work msg_timer_work; struct work_struct work; struct strp_stats stats; struct strp_callbacks cb; }; /* Must be called with lock held for attached socket */ static inline void strp_pause(struct strparser *strp) { strp->paused = 1; } /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) { /* Save psock statistics in the mux when psock is being unattached. */ #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += \ strp->stats._stat) SAVE_PSOCK_STATS(msgs); SAVE_PSOCK_STATS(bytes); SAVE_PSOCK_STATS(mem_fail); SAVE_PSOCK_STATS(need_more_hdr); SAVE_PSOCK_STATS(msg_too_big); SAVE_PSOCK_STATS(msg_timeouts); SAVE_PSOCK_STATS(bad_hdr_len); #undef SAVE_PSOCK_STATS if (strp->aborted) agg_stats->aborts++; if (strp->interrupted) agg_stats->interrupted++; if (strp->unrecov_intr) agg_stats->unrecov_intr++; } static inline void aggregate_strp_stats(struct strp_aggr_stats *stats, struct strp_aggr_stats *agg_stats) { #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat) SAVE_PSOCK_STATS(msgs); SAVE_PSOCK_STATS(bytes); SAVE_PSOCK_STATS(mem_fail); SAVE_PSOCK_STATS(need_more_hdr); SAVE_PSOCK_STATS(msg_too_big); SAVE_PSOCK_STATS(msg_timeouts); SAVE_PSOCK_STATS(bad_hdr_len); SAVE_PSOCK_STATS(aborts); SAVE_PSOCK_STATS(interrupted); SAVE_PSOCK_STATS(unrecov_intr); #undef SAVE_PSOCK_STATS } void strp_done(struct strparser *strp); void strp_stop(struct strparser *strp); void strp_check_rcv(struct strparser *strp); int strp_init(struct strparser *strp, struct sock *sk, const struct strp_callbacks *cb); void strp_data_ready(struct strparser *strp); int strp_process(struct strparser *strp, struct sk_buff *orig_skb, unsigned int orig_offset, size_t orig_len, size_t max_msg_size, long timeo); #endif /* __NET_STRPARSER_H_ */
908 907 881 99 6 907 6 34 34 34 1460 792 641 641 794 793 987 987 234 852 986 11 2 4 10 10 882 880 156 811 135 135 15 15 854 851 27 4 2 2 2 17 10 7 10 17 831 834 832 66 908 151 828 151 24 129 226 215 33 6 220 834 195 909 21 3 20 915 916 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2003 * Copyright (c) Cisco 1999,2000 * Copyright (c) Motorola 1999,2000,2001 * Copyright (c) La Monte H.P. Yarroll 2001 * * This file is part of the SCTP kernel implementation. * * A collection class to handle the storage of transport addresses. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> */ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/if_inet6.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal helpers. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags); static void sctp_bind_addr_clean(struct sctp_bind_addr *); /* First Level Abstractions. */ /* Copy 'src' to 'dest' taking 'scope' into account. Omit addresses * in 'src' which have a broader scope than 'scope'. */ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, enum sctp_scope scope, gfp_t gfp, int flags) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; /* Extract the addresses which are relevant for this scope. */ list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, scope, gfp, flags); if (error < 0) goto out; } /* If there are no addresses matching the scope and * this is global scope, try to get a link scope address, with * the assumption that we must be sitting behind a NAT. */ if (list_empty(&dest->address_list) && (SCTP_SCOPE_GLOBAL == scope)) { list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, SCTP_SCOPE_LINK, gfp, flags); if (error < 0) goto out; } } /* If somehow no addresses were found that can be used with this * scope, it's an error. */ if (list_empty(&dest->address_list)) error = -ENETUNREACH; out: if (error) sctp_bind_addr_clean(dest); return error; } /* Exactly duplicate the address lists. This is necessary when doing * peer-offs and accepts. We don't want to put all the current system * addresses into the endpoint. That's useless. But we do want duplicat * the list of bound addresses that the older endpoint used. */ int sctp_bind_addr_dup(struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, gfp_t gfp) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; list_for_each_entry(addr, &src->address_list, list) { error = sctp_add_bind_addr(dest, &addr->a, sizeof(addr->a), 1, gfp); if (error < 0) break; } return error; } /* Initialize the SCTP_bind_addr structure for either an endpoint or * an association. */ void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port) { INIT_LIST_HEAD(&bp->address_list); bp->port = port; } /* Dispose of the address list. */ static void sctp_bind_addr_clean(struct sctp_bind_addr *bp) { struct sctp_sockaddr_entry *addr, *temp; /* Empty the bind address list. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { list_del_rcu(&addr->list); kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); } } /* Dispose of an SCTP_bind_addr structure */ void sctp_bind_addr_free(struct sctp_bind_addr *bp) { /* Empty the bind address list. */ sctp_bind_addr_clean(bp); } /* Add an address to the bind address list in the SCTP_bind_addr structure. */ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, int new_size, __u8 addr_state, gfp_t gfp) { struct sctp_sockaddr_entry *addr; /* Add the address to the bind address list. */ addr = kzalloc(sizeof(*addr), gfp); if (!addr) return -ENOMEM; memcpy(&addr->a, new, min_t(size_t, sizeof(*new), new_size)); /* Fix up the port if it has not yet been set. * Both v4 and v6 have the port at the same offset. */ if (!addr->a.v4.sin_port) addr->a.v4.sin_port = htons(bp->port); addr->state = addr_state; addr->valid = 1; INIT_LIST_HEAD(&addr->list); /* We always hold a socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_add_tail_rcu(&addr->list, &bp->address_list); SCTP_DBG_OBJCNT_INC(addr); return 0; } /* Delete an address from the bind address list in the SCTP_bind_addr * structure. */ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr) { struct sctp_sockaddr_entry *addr, *temp; int found = 0; /* We hold the socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { if (sctp_cmp_addr_exact(&addr->a, del_addr)) { /* Found the exact match. */ found = 1; addr->valid = 0; list_del_rcu(&addr->list); break; } } if (found) { kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); return 0; } return -EINVAL; } /* Create a network byte-order representation of all the addresses * formated as SCTP parameters. * * The second argument is the return value for the length. */ union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, int *addrs_len, gfp_t gfp) { union sctp_params addrparms; union sctp_params retval; int addrparms_len; union sctp_addr_param rawaddr; int len; struct sctp_sockaddr_entry *addr; struct list_head *pos; struct sctp_af *af; addrparms_len = 0; len = 0; /* Allocate enough memory at once. */ list_for_each(pos, &bp->address_list) { len += sizeof(union sctp_addr_param); } /* Don't even bother embedding an address if there * is only one. */ if (len == sizeof(union sctp_addr_param)) { retval.v = NULL; goto end_raw; } retval.v = kmalloc(len, gfp); if (!retval.v) goto end_raw; addrparms = retval; list_for_each_entry(addr, &bp->address_list, list) { af = sctp_get_af_specific(addr->a.v4.sin_family); len = af->to_addr_param(&addr->a, &rawaddr); memcpy(addrparms.v, &rawaddr, len); addrparms.v += len; addrparms_len += len; } end_raw: *addrs_len = addrparms_len; return retval; } /* * Create an address list out of the raw address list format (IPv4 and IPv6 * address parameters). */ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, int addrs_len, __u16 port, gfp_t gfp) { union sctp_addr_param *rawaddr; struct sctp_paramhdr *param; union sctp_addr addr; int retval = 0; int len; struct sctp_af *af; /* Convert the raw address to standard address format */ while (addrs_len) { param = (struct sctp_paramhdr *)raw_addr_list; rawaddr = (union sctp_addr_param *)raw_addr_list; af = sctp_get_af_specific(param_type2af(param->type)); if (unlikely(!af) || !af->from_addr_param(&addr, rawaddr, htons(port), 0)) { retval = -EINVAL; goto out_err; } if (sctp_bind_addr_state(bp, &addr) != -1) goto next; retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), SCTP_ADDR_SRC, gfp); if (retval) /* Can't finish building the list, clean up. */ goto out_err; next: len = ntohs(param->length); addrs_len -= len; raw_addr_list += len; } return retval; out_err: if (retval) sctp_bind_addr_clean(bp); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Does this contain a specified address? Allow wildcarding. */ int sctp_bind_addr_match(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; int match = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) { match = 1; break; } } rcu_read_unlock(); return match; } int sctp_bind_addrs_check(struct sctp_sock *sp, struct sctp_sock *sp2, int cnt2) { struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr; struct sctp_bind_addr *bp = &sp->ep->base.bind_addr; struct sctp_sockaddr_entry *laddr, *laddr2; bool exist = false; int cnt = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { list_for_each_entry_rcu(laddr2, &bp2->address_list, list) { if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) && laddr->valid && laddr2->valid) { exist = true; goto next; } } cnt = 0; break; next: cnt++; } rcu_read_unlock(); return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1); } /* Does the address 'addr' conflict with any addresses in * the bp. */ int sctp_bind_addr_conflict(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *bp_sp, struct sctp_sock *addr_sp) { struct sctp_sockaddr_entry *laddr; int conflict = 0; struct sctp_sock *sp; /* Pick the IPv6 socket as the basis of comparison * since it's usually a superset of the IPv4. * If there is no IPv6 socket, then default to bind_addr. */ if (sctp_opt2sk(bp_sp)->sk_family == AF_INET6) sp = bp_sp; else if (sctp_opt2sk(addr_sp)->sk_family == AF_INET6) sp = addr_sp; else sp = bp_sp; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; conflict = sp->pf->cmp_addr(&laddr->a, addr, sp); if (conflict) break; } rcu_read_unlock(); return conflict; } /* Get the state of the entry in the bind_addr_list */ int sctp_bind_addr_state(const struct sctp_bind_addr *bp, const union sctp_addr *addr) { struct sctp_sockaddr_entry *laddr; struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (unlikely(!af)) return -1; list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (af->cmp_addr(&laddr->a, addr)) return laddr->state; } return -1; } /* Find the first address in the bind address list that is not present in * the addrs packed array. */ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, const union sctp_addr *addrs, int addrcnt, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; union sctp_addr *addr; void *addr_buf; struct sctp_af *af; int i; /* This is only called sctp_send_asconf_del_ip() and we hold * the socket lock in that code patch, so that address list * can't change. */ list_for_each_entry(laddr, &bp->address_list, list) { addr_buf = (union sctp_addr *)addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); if (!af) break; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) break; addr_buf += af->sockaddr_len; } if (i == addrcnt) return &laddr->a; } return NULL; } /* Copy out addresses from the global local address list. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags) { int error = 0; if (sctp_is_any(NULL, addr)) { error = sctp_copy_local_addr_list(net, dest, scope, gfp, flags); } else if (sctp_in_scope(net, addr, scope)) { /* Now that the address is in scope, check to see if * the address type is supported by local sock as * well as the remote peer. */ if ((((AF_INET == addr->sa.sa_family) && (flags & SCTP_ADDR4_ALLOWED) && (flags & SCTP_ADDR4_PEERSUPP))) || (((AF_INET6 == addr->sa.sa_family) && (flags & SCTP_ADDR6_ALLOWED) && (flags & SCTP_ADDR6_PEERSUPP)))) error = sctp_add_bind_addr(dest, addr, sizeof(*addr), SCTP_ADDR_SRC, gfp); } return error; } /* Is this a wildcard address? */ int sctp_is_any(struct sock *sk, const union sctp_addr *addr) { unsigned short fam = 0; struct sctp_af *af; /* Try to get the right address family */ if (addr->sa.sa_family != AF_UNSPEC) fam = addr->sa.sa_family; else if (sk) fam = sk->sk_family; af = sctp_get_af_specific(fam); if (!af) return 0; return af->is_any(addr); } /* Is 'addr' valid for 'scope'? */ int sctp_in_scope(struct net *net, const union sctp_addr *addr, enum sctp_scope scope) { enum sctp_scope addr_scope = sctp_scope(addr); /* The unusable SCTP addresses will not be considered with * any defined scopes. */ if (SCTP_SCOPE_UNUSABLE == addr_scope) return 0; /* * For INIT and INIT-ACK address list, let L be the level of * requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * * Address scoping can be selectively controlled via sysctl * option */ switch (net->sctp.scope_policy) { case SCTP_SCOPE_POLICY_DISABLE: return 1; case SCTP_SCOPE_POLICY_ENABLE: if (addr_scope <= scope) return 1; break; case SCTP_SCOPE_POLICY_PRIVATE: if (addr_scope <= scope || SCTP_SCOPE_PRIVATE == addr_scope) return 1; break; case SCTP_SCOPE_POLICY_LINK: if (addr_scope <= scope || SCTP_SCOPE_LINK == addr_scope) return 1; break; default: break; } return 0; } int sctp_is_ep_boundall(struct sock *sk) { struct sctp_bind_addr *bp; struct sctp_sockaddr_entry *addr; bp = &sctp_sk(sk)->ep->base.bind_addr; if (sctp_list_single_entry(&bp->address_list)) { addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(sk, &addr->a)) return 1; } return 0; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ /* What is the scope of 'addr'? */ enum sctp_scope sctp_scope(const union sctp_addr *addr) { struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (!af) return SCTP_SCOPE_UNUSABLE; return af->scope((union sctp_addr *)addr); }
6 6 7 6 2 3 2 4 4 4 3 1 3 3 4 4 4 4 4 4 6 6 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Destination Hashing scheduling module * * Authors: Wensong Zhang <wensong@gnuchina.org> * * Inspired by the consistent hashing scheduler patch from * Thomas Proell <proellt@gmx.de> * * Changes: */ /* * The dh algorithm is to select server by the hash key of destination IP * address. The pseudo code is as follows: * * n <- servernode[dest_ip]; * if (n is dead) OR * (n is overloaded) OR (n.weight <= 0) then * return NULL; * * return n; * * Notes that servernode is a 256-bucket hash table that maps the hash * index derived from packet destination IP address to the current server * array. If the dh scheduler is used in cache cluster, it is good to * combine it with cache_bypass feature. When the statically assigned * server is dead or overloaded, the load balancer can bypass the cache * server and send requests to the original server directly. * */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/ip.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/hash.h> #include <net/ip_vs.h> /* * IPVS DH bucket */ struct ip_vs_dh_bucket { struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* * for IPVS DH entry hash table */ #ifndef CONFIG_IP_VS_DH_TAB_BITS #define CONFIG_IP_VS_DH_TAB_BITS 8 #endif #define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS #define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) #define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) struct ip_vs_dh_state { struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; struct rcu_head rcu_head; }; /* * Returns hash value for IPVS DH entry */ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS); } /* * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) { return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); } /* * Assign all the hash buckets of the specified table with the service. */ static int ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_dh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; bool empty; b = &s->buckets[0]; p = &svc->destinations; empty = list_empty(p); for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) ip_vs_dest_put(dest); if (empty) RCU_INIT_POINTER(b->dest, NULL); else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); ip_vs_dest_hold(dest); RCU_INIT_POINTER(b->dest, dest); p = p->next; } b++; } return 0; } /* * Flush all the hash buckets of the specified table. */ static void ip_vs_dh_flush(struct ip_vs_dh_state *s) { int i; struct ip_vs_dh_bucket *b; struct ip_vs_dest *dest; b = &s->buckets[0]; for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) { ip_vs_dest_put(dest); RCU_INIT_POINTER(b->dest, NULL); } b++; } } static int ip_vs_dh_init_svc(struct ip_vs_service *svc) { struct ip_vs_dh_state *s; /* allocate the DH table for this service */ s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); if (s == NULL) return -ENOMEM; svc->sched_data = s; IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); /* assign the hash buckets with current dests */ ip_vs_dh_reassign(s, svc); return 0; } static void ip_vs_dh_done_svc(struct ip_vs_service *svc) { struct ip_vs_dh_state *s = svc->sched_data; /* got to clean up hash buckets here */ ip_vs_dh_flush(s); /* release the table itself */ kfree_rcu(s, rcu_head); IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); } static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, struct ip_vs_dest *dest) { struct ip_vs_dh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ ip_vs_dh_reassign(s, svc); return 0; } /* * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, * consider that the server is overloaded here. */ static inline int is_overloaded(struct ip_vs_dest *dest) { return dest->flags & IP_VS_DEST_F_OVERLOAD; } /* * Destination hashing scheduling */ static struct ip_vs_dest * ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; struct ip_vs_dh_state *s; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); s = (struct ip_vs_dh_state *) svc->sched_data; dest = ip_vs_dh_get(svc->af, s, &iph->daddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest)) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } /* * IPVS DH Scheduler structure */ static struct ip_vs_scheduler ip_vs_dh_scheduler = { .name = "dh", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), .init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, .add_dest = ip_vs_dh_dest_changed, .del_dest = ip_vs_dh_dest_changed, .schedule = ip_vs_dh_schedule, }; static int __init ip_vs_dh_init(void) { return register_ip_vs_scheduler(&ip_vs_dh_scheduler); } static void __exit ip_vs_dh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); synchronize_rcu(); } module_init(ip_vs_dh_init); module_exit(ip_vs_dh_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs destination hashing scheduler");
77 1 1 123 39 41 39 5 78 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix * Copyright (C) 2006 Andrey Volkov, Varma Electronics * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com> */ #include <linux/can/dev.h> #include <linux/module.h> #define MOD_DESC "CAN device driver interface" MODULE_DESCRIPTION(MOD_DESC); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Wolfgang Grandegger <wg@grandegger.com>"); /* Local echo of CAN messages * * CAN network devices *should* support a local echo functionality * (see Documentation/networking/can.rst). To test the handling of CAN * interfaces that do not support the local echo both driver types are * implemented. In the case that the driver does not support the echo * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core * to perform the echo as a fallback solution. */ void can_flush_echo_skb(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; int i; for (i = 0; i < priv->echo_skb_max; i++) { if (priv->echo_skb[i]) { kfree_skb(priv->echo_skb[i]); priv->echo_skb[i] = NULL; stats->tx_dropped++; stats->tx_aborted_errors++; } } } /* Put the skb on the stack to be looped backed locally lateron * * The function is typically called in the start_xmit function * of the device driver. The driver must protect access to * priv->echo_skb, if necessary. */ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return -EINVAL; } /* check flag whether this packet has to be looped back */ if (!(dev->flags & IFF_ECHO) || (skb->protocol != htons(ETH_P_CAN) && skb->protocol != htons(ETH_P_CANFD) && skb->protocol != htons(ETH_P_CANXL))) { kfree_skb(skb); return 0; } if (!priv->echo_skb[idx]) { skb = can_create_echo_skb(skb); if (!skb) return -ENOMEM; /* make settings for echo to reduce code in irq context */ skb->ip_summed = CHECKSUM_UNNECESSARY; skb->dev = dev; /* save frame_len to reuse it when transmission is completed */ can_skb_prv(skb)->frame_len = frame_len; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; skb_tx_timestamp(skb); /* save this skb for tx interrupt echo handling */ priv->echo_skb[idx] = skb; } else { /* locking problem with netif_stop_queue() ?? */ netdev_err(dev, "%s: BUG! echo_skb %d is occupied!\n", __func__, idx); kfree_skb(skb); return -EBUSY; } return 0; } EXPORT_SYMBOL_GPL(can_put_echo_skb); struct sk_buff * __can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *len_ptr, unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return NULL; } if (priv->echo_skb[idx]) { /* Using "struct canfd_frame::len" for the frame * length is supported on both CAN and CANFD frames. */ struct sk_buff *skb = priv->echo_skb[idx]; struct can_skb_priv *can_skb_priv = can_skb_prv(skb); if (skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS) skb_tstamp_tx(skb, skb_hwtstamps(skb)); /* get the real payload length for netdev statistics */ *len_ptr = can_skb_get_data_len(skb); if (frame_len_ptr) *frame_len_ptr = can_skb_priv->frame_len; priv->echo_skb[idx] = NULL; if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_BROADCAST; } else { dev_consume_skb_any(skb); return NULL; } return skb; } return NULL; } /* Get the skb from the stack and loop it back locally * * The function is typically called when the TX done interrupt * is handled in the device driver. The driver must protect * access to priv->echo_skb, if necessary. */ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr) { struct sk_buff *skb; unsigned int len; skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr); if (!skb) return 0; skb_get(skb); if (netif_rx(skb) == NET_RX_SUCCESS) dev_consume_skb_any(skb); else dev_kfree_skb_any(skb); return len; } EXPORT_SYMBOL_GPL(can_get_echo_skb); /* Remove the skb from the stack and free it. * * The function is typically called when TX failed. */ void can_free_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return; } if (priv->echo_skb[idx]) { struct sk_buff *skb = priv->echo_skb[idx]; struct can_skb_priv *can_skb_priv = can_skb_prv(skb); if (frame_len_ptr) *frame_len_ptr = can_skb_priv->frame_len; dev_kfree_skb_any(skb); priv->echo_skb[idx] = NULL; } } EXPORT_SYMBOL_GPL(can_free_echo_skb); /* fill common values for CAN sk_buffs */ static void init_can_skb_reserve(struct sk_buff *skb) { skb->pkt_type = PACKET_BROADCAST; skb->ip_summed = CHECKSUM_UNNECESSARY; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); can_skb_reserve(skb); can_skb_prv(skb)->skbcnt = 0; } struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + sizeof(struct can_frame)); if (unlikely(!skb)) { *cf = NULL; return NULL; } skb->protocol = htons(ETH_P_CAN); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cf = skb_put_zero(skb, sizeof(struct can_frame)); return skb; } EXPORT_SYMBOL_GPL(alloc_can_skb); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd) { struct sk_buff *skb; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + sizeof(struct canfd_frame)); if (unlikely(!skb)) { *cfd = NULL; return NULL; } skb->protocol = htons(ETH_P_CANFD); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cfd = skb_put_zero(skb, sizeof(struct canfd_frame)); /* set CAN FD flag by default */ (*cfd)->flags = CANFD_FDF; return skb; } EXPORT_SYMBOL_GPL(alloc_canfd_skb); struct sk_buff *alloc_canxl_skb(struct net_device *dev, struct canxl_frame **cxl, unsigned int data_len) { struct sk_buff *skb; if (data_len < CANXL_MIN_DLEN || data_len > CANXL_MAX_DLEN) goto out_error; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + CANXL_HDR_SIZE + data_len); if (unlikely(!skb)) goto out_error; skb->protocol = htons(ETH_P_CANXL); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cxl = skb_put_zero(skb, CANXL_HDR_SIZE + data_len); /* set CAN XL flag and length information by default */ (*cxl)->flags = CANXL_XLF; (*cxl)->len = data_len; return skb; out_error: *cxl = NULL; return NULL; } EXPORT_SYMBOL_GPL(alloc_canxl_skb); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; skb = alloc_can_skb(dev, cf); if (unlikely(!skb)) return NULL; (*cf)->can_id = CAN_ERR_FLAG; (*cf)->len = CAN_ERR_DLC; return skb; } EXPORT_SYMBOL_GPL(alloc_can_err_skb); /* Check for outgoing skbs that have not been created by the CAN subsystem */ static bool can_skb_headroom_valid(struct net_device *dev, struct sk_buff *skb) { /* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */ if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv))) return false; /* af_packet does not apply CAN skb specific settings */ if (skb->ip_summed == CHECKSUM_NONE) { /* init headroom */ can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb->ip_summed = CHECKSUM_UNNECESSARY; /* perform proper loopback on capable devices */ if (dev->flags & IFF_ECHO) skb->pkt_type = PACKET_LOOPBACK; else skb->pkt_type = PACKET_HOST; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); /* set CANFD_FDF flag for CAN FD frames */ if (can_is_canfd_skb(skb)) { struct canfd_frame *cfd; cfd = (struct canfd_frame *)skb->data; cfd->flags |= CANFD_FDF; } } return true; } /* Drop a given socketbuffer if it does not contain a valid CAN frame. */ bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb) { switch (ntohs(skb->protocol)) { case ETH_P_CAN: if (!can_is_can_skb(skb)) goto inval_skb; break; case ETH_P_CANFD: if (!can_is_canfd_skb(skb)) goto inval_skb; break; case ETH_P_CANXL: if (!can_is_canxl_skb(skb)) goto inval_skb; break; default: goto inval_skb; } if (!can_skb_headroom_valid(dev, skb)) goto inval_skb; return false; inval_skb: kfree_skb(skb); dev->stats.tx_dropped++; return true; } EXPORT_SYMBOL_GPL(can_dropped_invalid_skb);
7914 854 40 854 6077 601 7914 138 184 6603 8674 8682 7918 6603 39 37 7641 246 5326 3677 3 7929 601 7924 7921 246 7916 6621 440 782 6604 6601 380 7928 7919 601 601 246 246 6619 440 4 344 13499 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H #include <linux/atomic.h> #include <linux/huge_mm.h> #include <linux/mm_types.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> #include <linux/swapops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? * @folio: The folio to test. * * We would like to get this info without a page flag, but the state * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. * * Return: An integer (not a boolean!) used to sort a folio onto the * right LRU list and to account folios correctly. * 1 if @folio is a regular filesystem backed page cache folio * or a lazily freed anonymous folio (e.g. via MADV_FREE). * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. */ static inline int folio_is_file_lru(struct folio *folio) { return !folio_test_swapbacked(folio); } static inline int page_is_file_lru(struct page *page) { return folio_is_file_lru(page_folio(page)); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment on MAX_NR_TIERS */ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); if (!(flags & BIT(PG_referenced))) return 0; /* * Return the total number of accesses including PG_referenced. Also see * the comment on LRU_REFS_FLAGS. */ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ * | Accessed through page tables and | Accessed through file descriptors | * | promoted by folio_update_gen() | and protected by folio_inc_gen() | * +-----------------------------------+-----------------------------------+ * | PG_active (set while isolated) | | * +-----------------+-----------------+-----------------+-----------------+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | * +-----------------------------------+-----------------------------------+ * |<---------- MIN_NR_GENS ---------->| | * |<---------------------------- MAX_NR_GENS ---------------------------->| */ if (folio_test_active(folio)) gen = MIN_NR_GENS - folio_test_workingset(folio); else if (reclaiming) gen = MAX_NR_GENS; else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) gen = MIN_NR_GENS; else gen = MAX_NR_GENS - folio_test_workingset(folio); return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if (folio_test_unevictable(folio) || !lrugen->enabled) return false; seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; set_mask_bits(&new->flags, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ void pfnmap_track_ctx_release(struct kref *ref); static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment if constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTLs release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. */ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment. * * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( swp_entry_t entry, struct vm_area_struct *dst_vma) { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } #endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * none pte also means we are not demoting the pte so tlb flushed is not needed. * E.g., when pte cleared the caller should have taken care of the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * Returns true if an uffd-wp pte was installed, false otherwise. */ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { #ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } #endif return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif
7 6 5 43 26 43 3 2 67 43 44 5 5 6 29 31 31 27 25 31 35 35 33 2 35 35 1 35 4 4 4 16 5 12 25 1 4 2 6 11 1 10 2 15 3 3 15 11 13 12 8 4 5 1 8 35 15 15 15 43 7 6 5 26 43 9 1 76 1 8 77 31 31 31 20 7 4 30 32 32 31 11 19 30 4 25 3 1 22 71 70 3 28 68 63 63 53 2 1 2 3 8 38 2 39 1 5 2 3 45 43 8 1 3 4 2 1 2 3 5 7 4 1 2 6 3 2 2 22 13 1 1 1 1 1 4 19 1 18 6 12 18 5 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015, Sony Mobile Communications Inc. * Copyright (c) 2013, The Linux Foundation. All rights reserved. */ #include <linux/module.h> #include <linux/netlink.h> #include <linux/qrtr.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/spinlock.h> #include <linux/wait.h> #include <net/sock.h> #include "qrtr.h" #define QRTR_PROTO_VER_1 1 #define QRTR_PROTO_VER_2 3 /* auto-bind range */ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff #define QRTR_EPH_PORT_RANGE \ XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) #define QRTR_PORT_CTRL_LEGACY 0xffff /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @src_node_id: source node * @src_port_id: source port * @confirm_rx: boolean; whether a resume-tx packet should be send in reply * @size: length of packet, excluding this header * @dst_node_id: destination node * @dst_port_id: destination port */ struct qrtr_hdr_v1 { __le32 version; __le32 type; __le32 src_node_id; __le32 src_port_id; __le32 confirm_rx; __le32 size; __le32 dst_node_id; __le32 dst_port_id; } __packed; /** * struct qrtr_hdr_v2 - (I|R)PCrouter packet header later versions * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @flags: bitmask of QRTR_FLAGS_* * @optlen: length of optional header data * @size: length of packet, excluding this header and optlen * @src_node_id: source node * @src_port_id: source port * @dst_node_id: destination node * @dst_port_id: destination port */ struct qrtr_hdr_v2 { u8 version; u8 type; u8 flags; u8 optlen; __le32 size; __le16 src_node_id; __le16 src_port_id; __le16 dst_node_id; __le16 dst_port_id; }; #define QRTR_FLAGS_CONFIRM_RX BIT(0) struct qrtr_cb { u32 src_node; u32 src_port; u32 dst_node; u32 dst_port; u8 type; u8 confirm_rx; }; #define QRTR_HDR_MAX_SIZE max_t(size_t, sizeof(struct qrtr_hdr_v1), \ sizeof(struct qrtr_hdr_v2)) struct qrtr_sock { /* WARNING: sk must be the first member */ struct sock sk; struct sockaddr_qrtr us; struct sockaddr_qrtr peer; }; static inline struct qrtr_sock *qrtr_sk(struct sock *sk) { BUILD_BUG_ON(offsetof(struct qrtr_sock, sk) != 0); return container_of(sk, struct qrtr_sock, sk); } static unsigned int qrtr_local_nid = 1; /* for node ids */ static RADIX_TREE(qrtr_nodes, GFP_ATOMIC); static DEFINE_SPINLOCK(qrtr_nodes_lock); /* broadcast list */ static LIST_HEAD(qrtr_all_nodes); /* lock for qrtr_all_nodes and node reference */ static DEFINE_MUTEX(qrtr_node_lock); /* local port allocation management */ static DEFINE_XARRAY_ALLOC(qrtr_ports); /** * struct qrtr_node - endpoint node * @ep_lock: lock for endpoint management and callbacks * @ep: endpoint * @ref: reference count for node * @nid: node id * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port * @qrtr_tx_lock: lock for qrtr_tx_flow inserts * @rx_queue: receive queue * @item: list item for broadcast list */ struct qrtr_node { struct mutex ep_lock; struct qrtr_endpoint *ep; struct kref ref; unsigned int nid; struct radix_tree_root qrtr_tx_flow; struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */ struct sk_buff_head rx_queue; struct list_head item; }; /** * struct qrtr_tx_flow - tx flow control * @resume_tx: waiters for a resume tx from the remote * @pending: number of waiting senders * @tx_failed: indicates that a message with confirm_rx flag was lost */ struct qrtr_tx_flow { struct wait_queue_head resume_tx; int pending; int tx_failed; }; #define QRTR_TX_FLOW_HIGH 10 #define QRTR_TX_FLOW_LOW 5 static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); static struct qrtr_sock *qrtr_port_lookup(int port); static void qrtr_port_put(struct qrtr_sock *ipc); /* Release node resources and free the node. * * Do not call directly, use qrtr_node_release. To be used with * kref_put_mutex. As such, the node mutex is expected to be locked on call. */ static void __qrtr_node_release(struct kref *kref) { struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); struct radix_tree_iter iter; struct qrtr_tx_flow *flow; unsigned long flags; void __rcu **slot; spin_lock_irqsave(&qrtr_nodes_lock, flags); /* If the node is a bridge for other nodes, there are possibly * multiple entries pointing to our released node, delete them all. */ radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { if (*slot == node) radix_tree_iter_delete(&qrtr_nodes, &iter, slot); } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); list_del(&node->item); mutex_unlock(&qrtr_node_lock); skb_queue_purge(&node->rx_queue); /* Free tx flow counters */ radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { flow = *slot; radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot); kfree(flow); } kfree(node); } /* Increment reference to node. */ static struct qrtr_node *qrtr_node_acquire(struct qrtr_node *node) { if (node) kref_get(&node->ref); return node; } /* Decrement reference to node and release as necessary. */ static void qrtr_node_release(struct qrtr_node *node) { if (!node) return; kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock); } /** * qrtr_tx_resume() - reset flow control counter * @node: qrtr_node that the QRTR_TYPE_RESUME_TX packet arrived on * @skb: resume_tx packet */ static void qrtr_tx_resume(struct qrtr_node *node, struct sk_buff *skb) { struct qrtr_ctrl_pkt *pkt = (struct qrtr_ctrl_pkt *)skb->data; u64 remote_node = le32_to_cpu(pkt->client.node); u32 remote_port = le32_to_cpu(pkt->client.port); struct qrtr_tx_flow *flow; unsigned long key; key = remote_node << 32 | remote_port; rcu_read_lock(); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); rcu_read_unlock(); if (flow) { spin_lock(&flow->resume_tx.lock); flow->pending = 0; spin_unlock(&flow->resume_tx.lock); wake_up_interruptible_all(&flow->resume_tx); } consume_skb(skb); } /** * qrtr_tx_wait() - flow control for outgoing packets * @node: qrtr_node that the packet is to be send to * @dest_node: node id of the destination * @dest_port: port number of the destination * @type: type of message * * The flow control scheme is based around the low and high "watermarks". When * the low watermark is passed the confirm_rx flag is set on the outgoing * message, which will trigger the remote to send a control message of the type * QRTR_TYPE_RESUME_TX to reset the counter. If the high watermark is hit * further transmision should be paused. * * Return: 1 if confirm_rx should be set, 0 otherwise or errno failure */ static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port, int type) { unsigned long key = (u64)dest_node << 32 | dest_port; struct qrtr_tx_flow *flow; int confirm_rx = 0; int ret; /* Never set confirm_rx on non-data packets */ if (type != QRTR_TYPE_DATA) return 0; mutex_lock(&node->qrtr_tx_lock); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); if (!flow) { flow = kzalloc(sizeof(*flow), GFP_KERNEL); if (flow) { init_waitqueue_head(&flow->resume_tx); if (radix_tree_insert(&node->qrtr_tx_flow, key, flow)) { kfree(flow); flow = NULL; } } } mutex_unlock(&node->qrtr_tx_lock); /* Set confirm_rx if we where unable to find and allocate a flow */ if (!flow) return 1; spin_lock_irq(&flow->resume_tx.lock); ret = wait_event_interruptible_locked_irq(flow->resume_tx, flow->pending < QRTR_TX_FLOW_HIGH || flow->tx_failed || !node->ep); if (ret < 0) { confirm_rx = ret; } else if (!node->ep) { confirm_rx = -EPIPE; } else if (flow->tx_failed) { flow->tx_failed = 0; confirm_rx = 1; } else { flow->pending++; confirm_rx = flow->pending == QRTR_TX_FLOW_LOW; } spin_unlock_irq(&flow->resume_tx.lock); return confirm_rx; } /** * qrtr_tx_flow_failed() - flag that tx of confirm_rx flagged messages failed * @node: qrtr_node that the packet is to be send to * @dest_node: node id of the destination * @dest_port: port number of the destination * * Signal that the transmission of a message with confirm_rx flag failed. The * flow's "pending" counter will keep incrementing towards QRTR_TX_FLOW_HIGH, * at which point transmission would stall forever waiting for the resume TX * message associated with the dropped confirm_rx message. * Work around this by marking the flow as having a failed transmission and * cause the next transmission attempt to be sent with the confirm_rx. */ static void qrtr_tx_flow_failed(struct qrtr_node *node, int dest_node, int dest_port) { unsigned long key = (u64)dest_node << 32 | dest_port; struct qrtr_tx_flow *flow; rcu_read_lock(); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); rcu_read_unlock(); if (flow) { spin_lock_irq(&flow->resume_tx.lock); flow->tx_failed = 1; spin_unlock_irq(&flow->resume_tx.lock); } } /* Pass an outgoing packet socket buffer to the endpoint driver. */ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct qrtr_hdr_v1 *hdr; size_t len = skb->len; int rc, confirm_rx; confirm_rx = qrtr_tx_wait(node, to->sq_node, to->sq_port, type); if (confirm_rx < 0) { kfree_skb(skb); return confirm_rx; } hdr = skb_push(skb, sizeof(*hdr)); hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); hdr->type = cpu_to_le32(type); hdr->src_node_id = cpu_to_le32(from->sq_node); hdr->src_port_id = cpu_to_le32(from->sq_port); if (to->sq_port == QRTR_PORT_CTRL) { hdr->dst_node_id = cpu_to_le32(node->nid); hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); } else { hdr->dst_node_id = cpu_to_le32(to->sq_node); hdr->dst_port_id = cpu_to_le32(to->sq_port); } hdr->size = cpu_to_le32(len); hdr->confirm_rx = !!confirm_rx; rc = skb_put_padto(skb, ALIGN(len, 4) + sizeof(*hdr)); if (!rc) { mutex_lock(&node->ep_lock); rc = -ENODEV; if (node->ep) rc = node->ep->xmit(node->ep, skb); else kfree_skb(skb); mutex_unlock(&node->ep_lock); } /* Need to ensure that a subsequent message carries the otherwise lost * confirm_rx flag if we dropped this one */ if (rc && confirm_rx) qrtr_tx_flow_failed(node, to->sq_node, to->sq_port); return rc; } /* Lookup node by id. * * callers must release with qrtr_node_release() */ static struct qrtr_node *qrtr_node_lookup(unsigned int nid) { struct qrtr_node *node; unsigned long flags; mutex_lock(&qrtr_node_lock); spin_lock_irqsave(&qrtr_nodes_lock, flags); node = radix_tree_lookup(&qrtr_nodes, nid); node = qrtr_node_acquire(node); spin_unlock_irqrestore(&qrtr_nodes_lock, flags); mutex_unlock(&qrtr_node_lock); return node; } /* Assign node id to node. * * This is mostly useful for automatic node id assignment, based on * the source id in the incoming packet. */ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) { unsigned long flags; if (nid == QRTR_EP_NID_AUTO) return; spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_insert(&qrtr_nodes, nid, node); if (node->nid == QRTR_EP_NID_AUTO) node->nid = nid; spin_unlock_irqrestore(&qrtr_nodes_lock, flags); } /** * qrtr_endpoint_post() - post incoming data * @ep: endpoint handle * @data: data pointer * @len: size of data in bytes * * Return: 0 on success; negative error code on failure */ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) { struct qrtr_node *node = ep->node; const struct qrtr_hdr_v1 *v1; const struct qrtr_hdr_v2 *v2; struct qrtr_sock *ipc; struct sk_buff *skb; struct qrtr_cb *cb; size_t size; unsigned int ver; size_t hdrlen; if (len == 0 || len & 3) return -EINVAL; skb = __netdev_alloc_skb(NULL, len, GFP_ATOMIC | __GFP_NOWARN); if (!skb) return -ENOMEM; cb = (struct qrtr_cb *)skb->cb; /* Version field in v1 is little endian, so this works for both cases */ ver = *(u8*)data; switch (ver) { case QRTR_PROTO_VER_1: if (len < sizeof(*v1)) goto err; v1 = data; hdrlen = sizeof(*v1); cb->type = le32_to_cpu(v1->type); cb->src_node = le32_to_cpu(v1->src_node_id); cb->src_port = le32_to_cpu(v1->src_port_id); cb->confirm_rx = !!v1->confirm_rx; cb->dst_node = le32_to_cpu(v1->dst_node_id); cb->dst_port = le32_to_cpu(v1->dst_port_id); size = le32_to_cpu(v1->size); break; case QRTR_PROTO_VER_2: if (len < sizeof(*v2)) goto err; v2 = data; hdrlen = sizeof(*v2) + v2->optlen; cb->type = v2->type; cb->confirm_rx = !!(v2->flags & QRTR_FLAGS_CONFIRM_RX); cb->src_node = le16_to_cpu(v2->src_node_id); cb->src_port = le16_to_cpu(v2->src_port_id); cb->dst_node = le16_to_cpu(v2->dst_node_id); cb->dst_port = le16_to_cpu(v2->dst_port_id); if (cb->src_port == (u16)QRTR_PORT_CTRL) cb->src_port = QRTR_PORT_CTRL; if (cb->dst_port == (u16)QRTR_PORT_CTRL) cb->dst_port = QRTR_PORT_CTRL; size = le32_to_cpu(v2->size); break; default: pr_err("qrtr: Invalid version %d\n", ver); goto err; } if (cb->dst_port == QRTR_PORT_CTRL_LEGACY) cb->dst_port = QRTR_PORT_CTRL; if (!size || len != ALIGN(size, 4) + hdrlen) goto err; if ((cb->type == QRTR_TYPE_NEW_SERVER || cb->type == QRTR_TYPE_RESUME_TX) && size < sizeof(struct qrtr_ctrl_pkt)) goto err; if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && cb->type != QRTR_TYPE_RESUME_TX) goto err; skb_put_data(skb, data + hdrlen, size); qrtr_node_assign(node, cb->src_node); if (cb->type == QRTR_TYPE_NEW_SERVER) { /* Remote node endpoint can bridge other distant nodes */ const struct qrtr_ctrl_pkt *pkt; pkt = data + hdrlen; qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); } if (cb->type == QRTR_TYPE_RESUME_TX) { qrtr_tx_resume(node, skb); } else { ipc = qrtr_port_lookup(cb->dst_port); if (!ipc) goto err; if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); goto err; } qrtr_port_put(ipc); } return 0; err: kfree_skb(skb); return -EINVAL; } EXPORT_SYMBOL_GPL(qrtr_endpoint_post); /** * qrtr_alloc_ctrl_packet() - allocate control packet skb * @pkt: reference to qrtr_ctrl_pkt pointer * @flags: the type of memory to allocate * * Returns newly allocated sk_buff, or NULL on failure * * This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and * on success returns a reference to the control packet in @pkt. */ static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt, gfp_t flags) { const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, flags); if (!skb) return NULL; skb_reserve(skb, QRTR_HDR_MAX_SIZE); *pkt = skb_put_zero(skb, pkt_len); return skb; } /** * qrtr_endpoint_register() - register a new endpoint * @ep: endpoint to register * @nid: desired node id; may be QRTR_EP_NID_AUTO for auto-assignment * Return: 0 on success; negative error code on failure * * The specified endpoint must have the xmit function pointer set on call. */ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) { struct qrtr_node *node; if (!ep || !ep->xmit) return -EINVAL; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; kref_init(&node->ref); mutex_init(&node->ep_lock); skb_queue_head_init(&node->rx_queue); node->nid = QRTR_EP_NID_AUTO; node->ep = ep; INIT_RADIX_TREE(&node->qrtr_tx_flow, GFP_KERNEL); mutex_init(&node->qrtr_tx_lock); qrtr_node_assign(node, nid); mutex_lock(&qrtr_node_lock); list_add(&node->item, &qrtr_all_nodes); mutex_unlock(&qrtr_node_lock); ep->node = node; return 0; } EXPORT_SYMBOL_GPL(qrtr_endpoint_register); /** * qrtr_endpoint_unregister - unregister endpoint * @ep: endpoint to unregister */ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) { struct qrtr_node *node = ep->node; struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; struct radix_tree_iter iter; struct qrtr_ctrl_pkt *pkt; struct qrtr_tx_flow *flow; struct sk_buff *skb; unsigned long flags; void __rcu **slot; mutex_lock(&node->ep_lock); node->ep = NULL; mutex_unlock(&node->ep_lock); /* Notify the local controller about the event */ spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { if (*slot != node) continue; src.sq_node = iter.index; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_ATOMIC); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); } } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); /* Wake up any transmitters waiting for resume-tx from the node */ mutex_lock(&node->qrtr_tx_lock); radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { flow = *slot; wake_up_interruptible_all(&flow->resume_tx); } mutex_unlock(&node->qrtr_tx_lock); qrtr_node_release(node); ep->node = NULL; } EXPORT_SYMBOL_GPL(qrtr_endpoint_unregister); /* Lookup socket by port. * * Callers must release with qrtr_port_put() */ static struct qrtr_sock *qrtr_port_lookup(int port) { struct qrtr_sock *ipc; if (port == QRTR_PORT_CTRL) port = 0; rcu_read_lock(); ipc = xa_load(&qrtr_ports, port); if (ipc) sock_hold(&ipc->sk); rcu_read_unlock(); return ipc; } /* Release acquired socket. */ static void qrtr_port_put(struct qrtr_sock *ipc) { sock_put(&ipc->sk); } /* Remove port assignment. */ static void qrtr_port_remove(struct qrtr_sock *ipc) { struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; int port = ipc->us.sq_port; struct sockaddr_qrtr to; to.sq_family = AF_QIPCRTR; to.sq_node = QRTR_NODE_BCAST; to.sq_port = QRTR_PORT_CTRL; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); pkt->client.node = cpu_to_le32(ipc->us.sq_node); pkt->client.port = cpu_to_le32(ipc->us.sq_port); skb_set_owner_w(skb, &ipc->sk); qrtr_bcast_enqueue(NULL, skb, QRTR_TYPE_DEL_CLIENT, &ipc->us, &to); } if (port == QRTR_PORT_CTRL) port = 0; __sock_put(&ipc->sk); xa_erase(&qrtr_ports, port); /* Ensure that if qrtr_port_lookup() did enter the RCU read section we * wait for it to up increment the refcount */ synchronize_rcu(); } /* Assign port number to socket. * * Specify port in the integer pointed to by port, and it will be adjusted * on return as necesssary. * * Port may be: * 0: Assign ephemeral port in [QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET] * <QRTR_MIN_EPH_SOCKET: Specified; requires CAP_NET_ADMIN * >QRTR_MIN_EPH_SOCKET: Specified; available to all */ static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) { int rc; if (!*port) { rc = xa_alloc(&qrtr_ports, port, ipc, QRTR_EPH_PORT_RANGE, GFP_KERNEL); } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) { rc = -EACCES; } else if (*port == QRTR_PORT_CTRL) { rc = xa_insert(&qrtr_ports, 0, ipc, GFP_KERNEL); } else { rc = xa_insert(&qrtr_ports, *port, ipc, GFP_KERNEL); } if (rc == -EBUSY) return -EADDRINUSE; else if (rc < 0) return rc; sock_hold(&ipc->sk); return 0; } /* Reset all non-control ports */ static void qrtr_reset_ports(void) { struct qrtr_sock *ipc; unsigned long index; rcu_read_lock(); xa_for_each_start(&qrtr_ports, index, ipc, 1) { sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; sk_error_report(&ipc->sk); sock_put(&ipc->sk); } rcu_read_unlock(); } /* Bind socket to address. * * Socket should be locked upon call. */ static int __qrtr_bind(struct socket *sock, const struct sockaddr_qrtr *addr, int zapped) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int port; int rc; /* rebinding ok */ if (!zapped && addr->sq_port == ipc->us.sq_port) return 0; port = addr->sq_port; rc = qrtr_port_assign(ipc, &port); if (rc) return rc; /* unbind previous, if any */ if (!zapped) qrtr_port_remove(ipc); ipc->us.sq_port = port; sock_reset_flag(sk, SOCK_ZAPPED); /* Notify all open ports about the new controller */ if (port == QRTR_PORT_CTRL) qrtr_reset_ports(); return 0; } /* Auto bind to an ephemeral port. */ static int qrtr_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct sockaddr_qrtr addr; if (!sock_flag(sk, SOCK_ZAPPED)) return 0; addr.sq_family = AF_QIPCRTR; addr.sq_node = qrtr_local_nid; addr.sq_port = 0; return __qrtr_bind(sock, &addr, 1); } /* Bind socket to specified sockaddr. */ static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int rc; if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) return -EINVAL; if (addr->sq_node != ipc->us.sq_node) return -EINVAL; lock_sock(sk); rc = __qrtr_bind(sock, addr, sock_flag(sk, SOCK_ZAPPED)); release_sock(sk); return rc; } /* Queue packet to local peer socket. */ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct qrtr_sock *ipc; struct qrtr_cb *cb; ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ if (ipc) qrtr_port_put(ipc); kfree_skb(skb); return -ENODEV; } cb = (struct qrtr_cb *)skb->cb; cb->src_node = from->sq_node; cb->src_port = from->sq_port; if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); kfree_skb(skb); return -ENOSPC; } qrtr_port_put(ipc); return 0; } /* Queue packet for broadcast. */ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct sk_buff *skbn; mutex_lock(&qrtr_node_lock); list_for_each_entry(node, &qrtr_all_nodes, item) { skbn = pskb_copy(skb, GFP_KERNEL); if (!skbn) break; skb_set_owner_w(skbn, skb->sk); qrtr_node_enqueue(node, skbn, type, from, to); } mutex_unlock(&qrtr_node_lock); qrtr_local_enqueue(NULL, skb, type, from, to); return 0; } static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *, int, struct sockaddr_qrtr *, struct sockaddr_qrtr *); __le32 qrtr_type = cpu_to_le32(QRTR_TYPE_DATA); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct qrtr_node *node; struct sk_buff *skb; size_t plen; u32 type; int rc; if (msg->msg_flags & ~(MSG_DONTWAIT)) return -EINVAL; if (len > 65535) return -EMSGSIZE; lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) { release_sock(sk); return -EINVAL; } if (addr->sq_family != AF_QIPCRTR) { release_sock(sk); return -EINVAL; } rc = qrtr_autobind(sock); if (rc) { release_sock(sk); return rc; } } else if (sk->sk_state == TCP_ESTABLISHED) { addr = &ipc->peer; } else { release_sock(sk); return -ENOTCONN; } node = NULL; if (addr->sq_node == QRTR_NODE_BCAST) { if (addr->sq_port != QRTR_PORT_CTRL && qrtr_local_nid != QRTR_NODE_BCAST) { release_sock(sk); return -ENOTCONN; } enqueue_fn = qrtr_bcast_enqueue; } else if (addr->sq_node == ipc->us.sq_node) { enqueue_fn = qrtr_local_enqueue; } else { node = qrtr_node_lookup(addr->sq_node); if (!node) { release_sock(sk); return -ECONNRESET; } enqueue_fn = qrtr_node_enqueue; } plen = (len + 3) & ~3; skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_MAX_SIZE, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) { rc = -ENOMEM; goto out_node; } skb_reserve(skb, QRTR_HDR_MAX_SIZE); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc) { kfree_skb(skb); goto out_node; } if (ipc->us.sq_port == QRTR_PORT_CTRL) { if (len < 4) { rc = -EINVAL; kfree_skb(skb); goto out_node; } /* control messages already require the type as 'command' */ skb_copy_bits(skb, 0, &qrtr_type, 4); } type = le32_to_cpu(qrtr_type); rc = enqueue_fn(node, skb, type, &ipc->us, addr); if (rc >= 0) rc = len; out_node: qrtr_node_release(node); release_sock(sk); return rc; } static int qrtr_send_resume_tx(struct qrtr_cb *cb) { struct sockaddr_qrtr remote = { AF_QIPCRTR, cb->src_node, cb->src_port }; struct sockaddr_qrtr local = { AF_QIPCRTR, cb->dst_node, cb->dst_port }; struct qrtr_ctrl_pkt *pkt; struct qrtr_node *node; struct sk_buff *skb; int ret; node = qrtr_node_lookup(remote.sq_node); if (!node) return -EINVAL; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (!skb) return -ENOMEM; pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); pkt->client.node = cpu_to_le32(cb->dst_node); pkt->client.port = cpu_to_le32(cb->dst_port); ret = qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, &local, &remote); qrtr_node_release(node); return ret; } static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); struct sock *sk = sock->sk; struct sk_buff *skb; struct qrtr_cb *cb; int copied, rc; lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { release_sock(sk); return -EADDRNOTAVAIL; } skb = skb_recv_datagram(sk, flags, &rc); if (!skb) { release_sock(sk); return rc; } cb = (struct qrtr_cb *)skb->cb; copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc < 0) goto out; rc = copied; if (addr) { /* There is an anonymous 2-byte hole after sq_family, * make sure to clear it. */ memset(addr, 0, sizeof(*addr)); addr->sq_family = AF_QIPCRTR; addr->sq_node = cb->src_node; addr->sq_port = cb->src_port; msg->msg_namelen = sizeof(*addr); } out: if (cb->confirm_rx) qrtr_send_resume_tx(cb); skb_free_datagram(sk, skb); release_sock(sk); return rc; } static int qrtr_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int rc; if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) return -EINVAL; lock_sock(sk); sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = qrtr_autobind(sock); if (rc) { release_sock(sk); return rc; } ipc->peer = *addr; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; release_sock(sk); return 0; } static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, int peer) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sockaddr_qrtr qaddr; struct sock *sk = sock->sk; lock_sock(sk); if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } qaddr = ipc->peer; } else { qaddr = ipc->us; } release_sock(sk); qaddr.sq_family = AF_QIPCRTR; memcpy(saddr, &qaddr, sizeof(qaddr)); return sizeof(qaddr); } static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct sockaddr_qrtr *sq; struct sk_buff *skb; struct ifreq ifr; long len = 0; int rc = 0; lock_sock(sk); switch (cmd) { case TIOCOUTQ: len = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (len < 0) len = 0; rc = put_user(len, (int __user *)argp); break; case TIOCINQ: skb = skb_peek(&sk->sk_receive_queue); if (skb) len = skb->len; rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: if (get_user_ifreq(&ifr, NULL, argp)) { rc = -EFAULT; break; } sq = (struct sockaddr_qrtr *)&ifr.ifr_addr; *sq = ipc->us; if (put_user_ifreq(&ifr, argp)) { rc = -EFAULT; break; } break; case SIOCADDRT: case SIOCDELRT: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: rc = -EINVAL; break; default: rc = -ENOIOCTLCMD; break; } release_sock(sk); return rc; } static int qrtr_release(struct socket *sock) { struct sock *sk = sock->sk; struct qrtr_sock *ipc; if (!sk) return 0; lock_sock(sk); ipc = qrtr_sk(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_orphan(sk); sock->sk = NULL; if (!sock_flag(sk, SOCK_ZAPPED)) qrtr_port_remove(ipc); skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static const struct proto_ops qrtr_proto_ops = { .owner = THIS_MODULE, .family = AF_QIPCRTR, .bind = qrtr_bind, .connect = qrtr_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .listen = sock_no_listen, .sendmsg = qrtr_sendmsg, .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, .gettstamp = sock_gettstamp, .poll = datagram_poll, .shutdown = sock_no_shutdown, .release = qrtr_release, .mmap = sock_no_mmap, }; static struct proto qrtr_proto = { .name = "QIPCRTR", .owner = THIS_MODULE, .obj_size = sizeof(struct qrtr_sock), }; static int qrtr_create(struct net *net, struct socket *sock, int protocol, int kern) { struct qrtr_sock *ipc; struct sock *sk; if (sock->type != SOCK_DGRAM) return -EPROTOTYPE; sk = sk_alloc(net, AF_QIPCRTR, GFP_KERNEL, &qrtr_proto, kern); if (!sk) return -ENOMEM; sock_set_flag(sk, SOCK_ZAPPED); sock_init_data(sock, sk); sock->ops = &qrtr_proto_ops; ipc = qrtr_sk(sk); ipc->us.sq_family = AF_QIPCRTR; ipc->us.sq_node = qrtr_local_nid; ipc->us.sq_port = 0; return 0; } static const struct net_proto_family qrtr_family = { .owner = THIS_MODULE, .family = AF_QIPCRTR, .create = qrtr_create, }; static int __init qrtr_proto_init(void) { int rc; rc = proto_register(&qrtr_proto, 1); if (rc) return rc; rc = sock_register(&qrtr_family); if (rc) goto err_proto; rc = qrtr_ns_init(); if (rc) goto err_sock; return 0; err_sock: sock_unregister(qrtr_family.family); err_proto: proto_unregister(&qrtr_proto); return rc; } postcore_initcall(qrtr_proto_init); static void __exit qrtr_proto_fini(void) { qrtr_ns_remove(); sock_unregister(qrtr_family.family); proto_unregister(&qrtr_proto); } module_exit(qrtr_proto_fini); MODULE_DESCRIPTION("Qualcomm IPC-router driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_NETPROTO(PF_QIPCRTR);
1 3 3 9 29 7 1 3 3 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi */ #include <linux/kthread.h> #include <linux/slab.h> #include "usbip_common.h" #include "vhci.h" /* get URB from transmitted urb queue. caller must hold vdev->priv_lock */ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum) { struct vhci_priv *priv, *tmp; struct urb *urb = NULL; int status; list_for_each_entry_safe(priv, tmp, &vdev->priv_rx, list) { if (priv->seqnum != seqnum) continue; urb = priv->urb; status = urb->status; usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum); switch (status) { case -ENOENT: fallthrough; case -ECONNRESET: dev_dbg(&urb->dev->dev, "urb seq# %u was unlinked %ssynchronously\n", seqnum, status == -ENOENT ? "" : "a"); break; case -EINPROGRESS: /* no info output */ break; default: dev_dbg(&urb->dev->dev, "urb seq# %u may be in a error, status %d\n", seqnum, status); } list_del(&priv->list); kfree(priv); urb->hcpriv = NULL; break; } return urb; } static void vhci_recv_ret_submit(struct vhci_device *vdev, struct usbip_header *pdu) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct vhci *vhci = vhci_hcd->vhci; struct usbip_device *ud = &vdev->ud; struct urb *urb; unsigned long flags; spin_lock_irqsave(&vdev->priv_lock, flags); urb = pickup_urb_and_free_priv(vdev, pdu->base.seqnum); spin_unlock_irqrestore(&vdev->priv_lock, flags); if (!urb) { pr_err("cannot find a urb of seqnum %u max seqnum %u\n", pdu->base.seqnum, atomic_read(&vhci_hcd->seqnum)); usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return; } /* unpack the pdu to a urb */ usbip_pack_pdu(pdu, urb, USBIP_RET_SUBMIT, 0); /* recv transfer buffer */ if (usbip_recv_xbuff(ud, urb) < 0) { urb->status = -EPROTO; goto error; } /* recv iso_packet_descriptor */ if (usbip_recv_iso(ud, urb) < 0) { urb->status = -EPROTO; goto error; } /* restore the padding in iso packets */ usbip_pad_iso(ud, urb); error: if (usbip_dbg_flag_vhci_rx) usbip_dump_urb(urb); if (urb->num_sgs) urb->transfer_flags &= ~URB_DMA_MAP_SG; usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum); spin_lock_irqsave(&vhci->lock, flags); usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb); spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_giveback_urb(vhci_hcd_to_hcd(vhci_hcd), urb, urb->status); usbip_dbg_vhci_rx("Leave\n"); } static struct vhci_unlink *dequeue_pending_unlink(struct vhci_device *vdev, struct usbip_header *pdu) { struct vhci_unlink *unlink, *tmp; unsigned long flags; spin_lock_irqsave(&vdev->priv_lock, flags); list_for_each_entry_safe(unlink, tmp, &vdev->unlink_rx, list) { pr_info("unlink->seqnum %lu\n", unlink->seqnum); if (unlink->seqnum == pdu->base.seqnum) { usbip_dbg_vhci_rx("found pending unlink, %lu\n", unlink->seqnum); list_del(&unlink->list); spin_unlock_irqrestore(&vdev->priv_lock, flags); return unlink; } } spin_unlock_irqrestore(&vdev->priv_lock, flags); return NULL; } static void vhci_recv_ret_unlink(struct vhci_device *vdev, struct usbip_header *pdu) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct vhci *vhci = vhci_hcd->vhci; struct vhci_unlink *unlink; struct urb *urb; unsigned long flags; usbip_dump_header(pdu); unlink = dequeue_pending_unlink(vdev, pdu); if (!unlink) { pr_info("cannot find the pending unlink %u\n", pdu->base.seqnum); return; } spin_lock_irqsave(&vdev->priv_lock, flags); urb = pickup_urb_and_free_priv(vdev, unlink->unlink_seqnum); spin_unlock_irqrestore(&vdev->priv_lock, flags); if (!urb) { /* * I get the result of a unlink request. But, it seems that I * already received the result of its submit result and gave * back the URB. */ pr_info("the urb (seqnum %u) was already given back\n", pdu->base.seqnum); } else { usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum); /* If unlink is successful, status is -ECONNRESET */ urb->status = pdu->u.ret_unlink.status; pr_info("urb->status %d\n", urb->status); spin_lock_irqsave(&vhci->lock, flags); usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb); spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_giveback_urb(vhci_hcd_to_hcd(vhci_hcd), urb, urb->status); } kfree(unlink); } static int vhci_priv_tx_empty(struct vhci_device *vdev) { int empty = 0; unsigned long flags; spin_lock_irqsave(&vdev->priv_lock, flags); empty = list_empty(&vdev->priv_rx); spin_unlock_irqrestore(&vdev->priv_lock, flags); return empty; } /* recv a pdu */ static void vhci_rx_pdu(struct usbip_device *ud) { int ret; struct usbip_header pdu; struct vhci_device *vdev = container_of(ud, struct vhci_device, ud); usbip_dbg_vhci_rx("Enter\n"); memset(&pdu, 0, sizeof(pdu)); /* receive a pdu header */ ret = usbip_recv(ud->tcp_socket, &pdu, sizeof(pdu)); if (ret < 0) { if (ret == -ECONNRESET) pr_info("connection reset by peer\n"); else if (ret == -EAGAIN) { /* ignore if connection was idle */ if (vhci_priv_tx_empty(vdev)) return; pr_info("connection timed out with pending urbs\n"); } else if (ret != -ERESTARTSYS) pr_info("xmit failed %d\n", ret); usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return; } if (ret == 0) { pr_info("connection closed"); usbip_event_add(ud, VDEV_EVENT_DOWN); return; } if (ret != sizeof(pdu)) { pr_err("received pdu size is %d, should be %d\n", ret, (unsigned int)sizeof(pdu)); usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return; } usbip_header_correct_endian(&pdu, 0); if (usbip_dbg_flag_vhci_rx) usbip_dump_header(&pdu); switch (pdu.base.command) { case USBIP_RET_SUBMIT: vhci_recv_ret_submit(vdev, &pdu); break; case USBIP_RET_UNLINK: vhci_recv_ret_unlink(vdev, &pdu); break; default: /* NOT REACHED */ pr_err("unknown pdu %u\n", pdu.base.command); usbip_dump_header(&pdu); usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); break; } } int vhci_rx_loop(void *data) { struct usbip_device *ud = data; while (!kthread_should_stop()) { if (usbip_event_happened(ud)) break; usbip_kcov_remote_start(ud); vhci_rx_pdu(ud); usbip_kcov_remote_stop(); } return 0; }
291 291 291 12 18548 18547 18552 18646 18452 278 278 17996 823 816 6 824 822 824 221 213 12 4844 4838 12471 933 26611 49 26625 4841 4841 4842 23710 23703 23763 9499 17097 17109 49 14212 258 3691 28 3676 3677 25 5750 9394 8882 274 8715 11700 237 221 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/memblock.h> /* max_low_pfn */ #include <linux/kfence.h> /* kfence_handle_page_fault */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> #include <linux/mm.h> /* find_and_lock_vma() */ #include <linux/vmalloc.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ #include <asm/irq_stack.h> #include <asm/fred.h> #include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS #include <trace/events/exceptions.h> /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ static nokprobe_inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; return 0; } /* * Prefetch quirks: * * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. * Check that here and ignore it. This is AMD erratum #91. * * 64-bit mode: * * Sometimes the CPU reports invalid exceptions on prefetch. * Check that here and ignore it. * * Opcode checker based on code by Richard Brunner. */ static inline int check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, unsigned char opcode, int *prefetch) { unsigned char instr_hi = opcode & 0xf0; unsigned char instr_lo = opcode & 0x0f; switch (instr_hi) { case 0x20: case 0x30: /* * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. * In X86_64 long mode, the CPU will signal invalid * opcode if some of these prefixes are present so * X86_64 will never get here anyway */ return ((instr_lo & 7) == 0x6); #ifdef CONFIG_X86_64 case 0x40: /* * In 64-bit mode 0x40..0x4F are valid REX prefixes */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ return (instr_lo & 0xC) == 0x4; case 0xF0: /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ return !instr_lo || (instr_lo>>1) == 1; case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ if (get_kernel_nofault(opcode, instr)) return 0; *prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); return 0; default: return 0; } } static bool is_amd_k8_pre_npt(void) { struct cpuinfo_x86 *c = &boot_cpu_data; return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && c->x86_vendor == X86_VENDOR_AMD && c->x86 == 0xf && c->x86_model < 0x40); } static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; int prefetch = 0; /* Erratum #91 affects AMD K8, pre-NPT CPUs */ if (!is_amd_k8_pre_npt()) return 0; /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ if (error_code & X86_PF_INSTR) return 0; instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; /* * This code has historically always bailed out if IP points to a * not-present page (e.g. due to a race). No one has ever * complained about this. */ pagefault_disable(); while (instr < max_instr) { unsigned char opcode; if (user_mode(regs)) { if (get_user(opcode, (unsigned char __user *) instr)) break; } else { if (get_kernel_nofault(opcode, instr)) break; } instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } pagefault_enable(); return prefetch; } DEFINE_SPINLOCK(pgd_lock); LIST_HEAD(pgd_list); #ifdef CONFIG_X86_32 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pgd += index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) return NULL; /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would * set_p4d/set_pud. */ p4d = p4d_offset(pgd, address); p4d_k = p4d_offset(pgd_k, address); if (!p4d_present(*p4d_k)) return NULL; pud = pud_offset(p4d, address); pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (pmd_present(*pmd) != pmd_present(*pmd_k)) set_pmd(pmd, *pmd_k); if (!pmd_present(*pmd_k)) return NULL; else BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); return pmd_k; } /* * Handle a fault on the vmalloc or module mapping area * * This is needed because there is a race condition between the time * when the vmalloc mapping code updates the PMD to the point in time * where it synchronizes this update with the other page-tables in the * system. * * In this race window another thread/CPU can map an area on the same * PMD, finds it already present and does not synchronize it with the * rest of the system yet. As a result v[mz]alloc might return areas * which are not mapped in every page-table in the system, causing an * unhandled page-fault when they are accessed. */ static noinline int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. */ pgd_paddr = read_cr3_pa(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; if (pmd_leaf(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; return 0; } NOKPROBE_SYMBOL(vmalloc_fault); void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; for (addr = start & PMD_MASK; addr >= TASK_SIZE_MAX && addr < VMALLOC_END; addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(address)]; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; #ifdef CONFIG_X86_PAE pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #define pr_pde pr_cont #else #define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); #undef pr_pde /* * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: pr_cont("\n"); } #else /* CONFIG_X86_64: */ #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" "******* Working around it, but it may cause SEGVs or burn power.\n" "******* Please consider a BIOS update.\n" "******* Disabling USB legacy in the BIOS may also help.\n"; #endif static int bad_address(void *p) { unsigned long dummy; return get_kernel_nofault(dummy, (unsigned long *)p); } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = base + pgd_index(address); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; if (bad_address(pgd)) goto bad; pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (bad_address(p4d)) goto bad; pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_leaf(*p4d)) goto out; pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); if (bad_address(pte)) goto bad; pr_cont("PTE %lx", pte_val(*pte)); out: pr_cont("\n"); return; bad: pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. * * BIOS SMM functions are required to use a specific workaround * to avoid corruption of the 64bit RIP register on C stepping K8. * * A lot of BIOS that didn't get tested properly miss this. * * The OS sees this as a page fault with the upper 32bits of RIP cleared. * Try to work around it here. * * Note we only handle faults in kernel here. * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 != 0xf) return 0; if (user_mode(regs)) return 0; if (address != regs->ip) return 0; if ((address >> 32) != 0) return 0; address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { printk_once(errata93_warning); regs->ip = address; return 1; } #endif return 0; } /* * Work around K8 erratum #100 K8 in compat mode occasionally jumps * to illegal addresses >4GB. * * We catch this in the page fault handler because these addresses * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; } /* Pentium F0 0F C7 C8 bug workaround: */ static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && idt_is_f00f_address(address)) { handle_invalid_op(regs); return 1; } #endif return 0; } static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) { u32 offset = (index >> 3) * sizeof(struct desc_struct); unsigned long addr; struct ldttss_desc desc; if (index == 0) { pr_alert("%s: NULL\n", name); return; } if (offset + sizeof(struct ldttss_desc) >= gdt->size) { pr_alert("%s: 0x%hx -- out of bounds\n", name, index); return; } if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset), sizeof(struct ldttss_desc))) { pr_alert("%s: 0x%hx -- GDT entry is not readable\n", name, index); return; } addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24); #ifdef CONFIG_X86_64 addr |= ((u64)desc.base3 << 32); #endif pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (!oops_may_print()) return; if (error_code & X86_PF_INSTR) { unsigned int level; bool nx, rw; pgd_t *pgd; pte_t *pte; pgd = __va(read_cr3_pa()); pgd += pgd_index(address); pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw); if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx)) pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && !nx && (pgd_flags(*pgd) & _PAGE_USER) && (__read_cr4() & X86_CR4_SMEP)) pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); } if (address < PAGE_SIZE && !user_mode(regs)) pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", (void *)address); else pr_alert("BUG: unable to handle page fault for address: %px\n", (void *)address); pr_alert("#PF: %s %s in %s mode\n", (error_code & X86_PF_USER) ? "user" : "supervisor", (error_code & X86_PF_INSTR) ? "instruction fetch" : (error_code & X86_PF_WRITE) ? "write access" : "read access", user_mode(regs) ? "user" : "kernel"); pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, !(error_code & X86_PF_PROT) ? "not-present page" : (error_code & X86_PF_RSVD) ? "reserved bit violation" : (error_code & X86_PF_PK) ? "protection keys violation" : (error_code & X86_PF_RMP) ? "RMP violation" : "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; /* * This can happen for quite a few reasons. The more obvious * ones are faults accessing the GDT, or LDT. Perhaps * surprisingly, if the CPU tries to deliver a benign or * contributory exception from user code and gets a page fault * during delivery, the page fault can be delivered as though * it originated directly from user code. This could happen * due to wrong permissions on the IDT, GDT, LDT, TSS, or * kernel or IST stack. */ store_idt(&idt); /* Usable even on Xen PV -- it's just slow. */ native_store_gdt(&gdt); pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", idt.address, idt.size, gdt.address, gdt.size); store_ldt(ldtr); show_ldttss(&gdt, "LDTR", ldtr); store_tr(tr); show_ldttss(&gdt, "TR", tr); } dump_pagetable(address); if (error_code & X86_PF_RMP) snp_dump_hva_rmpentry(address); } static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk; unsigned long flags; int sig; flags = oops_begin(); tsk = current; sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); if (__die("Bad pagetable", regs, error_code)) sig = 0; oops_end(flags, regs, sig); } static void sanitize_error_code(unsigned long address, unsigned long *error_code) { /* * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to * kernel addresses are always protection faults. * * NB: This means that failed vsyscalls with vsyscall=none * will have the PROT bit. This doesn't leak any * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) *error_code |= X86_PF_PROT; } static void set_signal_archinfo(unsigned long address, unsigned long error_code) { struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | X86_PF_USER; tsk->thread.cr2 = address; } static noinline void page_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_VMAP_STACK struct stack_info info; #endif unsigned long flags; int sig; if (user_mode(regs)) { /* * Implicit kernel access from user mode? Skip the stack * overflow and EFI special cases. */ goto oops; } #ifdef CONFIG_VMAP_STACK /* * Stack overflow? During boot, we can fault near the initial * stack in the direct map, but that's not an overflow -- check * that we're in vmalloc space to avoid this. */ if (is_vmalloc_addr((void *)address) && get_stack_guard_info((void *)address, &info)) { /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but * double-fault even before we get this far, in which case * we're fine: the double-fault handler will deal with it. * * We don't want to make it all the way into the oops code * and then double-fault, though, because we're likely to * break the console driver and lose most of the stack dump. */ call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*), handle_stack_overflow, ASM_CALL_ARG3, , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info)); BUG(); } #endif /* * Buggy firmware could access regions which might page fault. If * this happens, EFI has a special OOPS path that will try to * avoid hanging the system. */ if (IS_ENABLED(CONFIG_EFI)) efi_crash_gracefully_on_page_fault(address); /* Only not-present faults should be handled by KFENCE. */ if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) return; oops: /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ flags = oops_begin(); show_fault_oops(regs, error_code, address); if (task_stack_end_corrupted(current)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } static noinline void kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code, u32 pkey) { WARN_ON_ONCE(user_mode(regs)); /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) return; /* * AMD erratum #91 manifests as a spurious page fault on a PREFETCH * instruction. */ if (is_prefetch(regs, error_code, address)) return; page_fault_oops(regs, error_code, address); } /* * Print out info about fatal segfaults, if the show_unhandled_signals * sysctl is set: */ static inline void show_signal_msg(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk) { const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; /* This is a racy snapshot, but it's better than nothing. */ int cpu = raw_smp_processor_id(); if (!unhandled_signal(tsk, SIGSEGV)) return; if (!printk_ratelimit()) return; printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", loglvl, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); /* * Dump the likely CPU where the fatal segfault happened. * This can help identify faulty hardware. */ printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu, topology_core_id(cpu), topology_physical_package_id(cpu)); printk(KERN_CONT "\n"); show_opcodes(regs, loglvl); } static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) { struct task_struct *tsk = current; if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, si_code, pkey); return; } if (!(error_code & X86_PF_USER)) { /* Implicit user access to kernel memory -- just oops */ page_fault_oops(regs, error_code, address); return; } /* * User mode accesses just cause a SIGSEGV. * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came * from user space: */ if (is_prefetch(regs, error_code, address)) return; if (is_errata100(regs, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); else force_sig_fault(SIGSEGV, si_code, (void __user *)address); local_irq_disable(); } static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR); } static void __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct mm_struct *mm, struct vm_area_struct *vma, u32 pkey, int si_code) { /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ if (mm) mmap_read_unlock(mm); else vma_end_read(vma); __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { /* This code is always called on the current mm */ bool foreign = false; if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return false; if (error_code & X86_PF_PK) return true; /* this checks permission keys on the VMA: */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return true; return false; } static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct mm_struct *mm, struct vm_area_struct *vma) { /* * This OSPKE check is not strictly necessary at runtime. * But, doing it this way allows compiler optimizations * if pkeys are compiled out. */ if (bad_area_access_from_pkeys(error_code, vma)) { /* * A protection key fault means that the PKRU value did not allow * access to some PTE. Userspace can figure out what PKRU was * from the XSAVE state. This function captures the pkey from * the vma and passes it to userspace so userspace can discover * which protection key was set on the PTE. * * If we get here, we know that the hardware signaled a X86_PF_PK * fault and that there was a VMA once we got in the fault * handler. It does *not* guarantee that the VMA we find here * was the one that we faulted on. * * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); * 2. T1 : set PKRU to deny access to pkey=4, touches page * 3. T1 : faults... * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); * 5. T1 : enters fault handler, takes mmap_lock, etc... * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ u32 pkey = vma_pkey(vma); __bad_area(regs, error_code, address, mm, vma, pkey, SEGV_PKUERR); } else { __bad_area(regs, error_code, address, mm, vma, 0, SEGV_ACCERR); } } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { struct task_struct *tsk = current; unsigned lsb = 0; pr_err( "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); return; } #endif force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; return 1; } /* * Handle a spurious fault caused by a stale TLB entry. * * This allows us to lazily refresh the TLB when increasing the * permissions of a kernel page (RO -> RW or NX -> X). Doing it * eagerly is very expensive since that implies doing a full * cross-processor TLB flush, even if no stale TLB entries exist * on other processors. * * Spurious faults may only occur if the TLB contains an entry with * fewer permission than the page table entry. Non-present (P = 0) * and reserved bit (R = 1) faults are never spurious. * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. * * Returns non-zero if a spurious fault was handled, zero otherwise. * * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 * (Optional Invalidation). */ static noinline int spurious_kernel_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; int ret; /* * Only writes to RO or instruction fetches from NX may cause * spurious faults. * * These could be from user or supervisor accesses but the TLB * is only lazily flushed after a kernel mapping protection * change, so user accesses are not expected to cause spurious * faults. */ if (error_code != (X86_PF_WRITE | X86_PF_PROT) && error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); if (!pgd_present(*pgd)) return 0; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) return 0; if (p4d_leaf(*p4d)) return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; if (pud_leaf(*pud)) return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; if (pmd_leaf(*pmd)) return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; ret = spurious_kernel_fault_check(error_code, pte); if (!ret) return 0; /* * Make sure we have permissions in PMD. * If not, then there's a bug in the page tables: */ ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); return ret; } NOKPROBE_SYMBOL(spurious_kernel_fault); int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { /* This is only called for the current mm, so: */ bool foreign = false; /* * Read or write was blocked by protection keys. This is * always an unconditional error and can never result in * a follow-up action to resolve the fault, like a COW. */ if (error_code & X86_PF_PK) return 1; /* * SGX hardware blocked the access. This usually happens * when the enclave memory contents have been destroyed, like * after a suspend/resume cycle. In any case, the kernel can't * fix the cause of the fault. Handle the fault as an access * error even in cases where no actual access violation * occurred. This allows userspace to rebuild the enclave in * response to the signal. */ if (unlikely(error_code & X86_PF_SGX)) return 1; /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a * page. */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return 1; /* * Shadow stack accesses (PF_SHSTK=1) are only permitted to * shadow stack VMAs. All other accesses result in an error. */ if (error_code & X86_PF_SHSTK) { if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } /* read, present: */ if (unlikely(error_code & X86_PF_PROT)) return 1; /* read, not present: */ if (unlikely(!vma_is_accessible(vma))) return 1; return 0; } bool fault_in_kernel_space(unsigned long address) { /* * On 64-bit systems, the vsyscall page is at an address above * TASK_SIZE_MAX, but is not considered part of the kernel * address space. */ if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) return false; return address >= TASK_SIZE_MAX; } /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that * ran in userspace or the kernel. */ static void do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { /* * Protection keys exceptions only happen on user pages. We * have no user pages in the kernel portion of the address * space, so do not expect them here. */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); #ifdef CONFIG_X86_32 /* * We can fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * Before doing this on-demand faulting, ensure that the * fault is not any of the following: * 1. A fault on a PTE with a reserved bit set. * 2. A fault caused by a user-mode access. (Do not demand- * fault kernel memory due to user-mode accesses). * 3. A fault caused by a page-level protection violation. * (A demand fault would be on a non-present page which * would have X86_PF_PROT==0). * * This is only needed to close a race condition on x86-32 in * the vmalloc mapping/unmapping code. See the comment above * vmalloc_fault() for details. On x86-64 the race does not * exist as the vmalloc mappings don't need to be synchronized * there. */ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; } #endif if (is_f00f_bug(regs, hw_error_code, address)) return; /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Note, despite being a "bad area", there are quite a few * acceptable reasons to get here, such as erratum fixups * and handling kernel code that can fault, like get_user(). * * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, hw_error_code, address); } NOKPROBE_SYMBOL(do_kern_addr_fault); /* * Handle faults in the user portion of the address space. Nothing in here * should check X86_PF_USER without a specific justification: for almost * all purposes, we should treat a normal kernel access to user memory * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. * The one exception is AC flag handling, which is, per the x86 * architecture, special for WRUSS. */ static inline void do_user_addr_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; tsk = current; mm = tsk->mm; if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { /* * Whoops, this is kernel mode code trying to execute from * user memory. Unless this is AMD erratum #93, which * corrupts RIP such that it looks like a user address, * this is unrecoverable. Don't even try to look up the * VMA or look for extable entries. */ if (is_errata93(regs, address)) return; page_fault_oops(regs, error_code, address); return; } /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user * pages in the user address space. The odd case here is WRUSS, * which, according to the preliminary documentation, does not respect * SMAP and will have the USER bit set so, in all cases, SMAP * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && !(error_code & X86_PF_USER) && !(regs->flags & X86_EFLAGS_AC))) { /* * No extable entry here. This was a kernel access to an * invalid pointer. get_kernel_nofault() will not get here. */ page_fault_oops(regs, error_code, address); return; } /* * If we're in an interrupt, have no user context or are running * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } /* Legacy check - remove this after verifying that it doesn't trigger */ if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { bad_area_nosemaphore(regs, error_code, address); return; } local_irq_enable(); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * Read-only permissions can not be expressed in shadow stack PTEs. * Treat all shadow stack accesses as WRITE faults. This ensures * that the MM will prepare everything (e.g., break COW) such that * maybe_mkwrite() can create a proper shadow stack PTE. */ if (error_code & X86_PF_SHSTK) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* * We set FAULT_FLAG_USER based on the register state, not * based on X86_PF_USER. User space accesses that cause * system page faults are still user accesses. */ if (user_mode(regs)) flags |= FAULT_FLAG_USER; #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The * vsyscall page is at a high address (>PAGE_OFFSET), but is * considered to be part of the user address space. * * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. * * PKRU never rejects instruction fetches, so we don't need * to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { if (emulate_vsyscall(error_code, regs, address)) return; } #endif if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); if (!vma) goto lock_mmap; if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, NULL, vma); count_vm_vma_lock_event(VMA_LOCK_SUCCESS); return; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault & VM_FAULT_MAJOR) flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } lock_mmap: retry: vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { bad_area_nosemaphore(regs, error_code, address); return; } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, mm, vma); return; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. * * Note that handle_userfault() may also release and reacquire mmap_lock * (and not return with VM_FAULT_RETRY), when returning to userland to * repeat the page fault later with a VM_FAULT_NOPAGE retval * (potentially after handling any pending signal during the return to * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { /* * Quick path to respond to signals. The core mm code * has unlocked the mm for us if we get here. */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* The fault is fully completed (including releasing mmap lock) */ if (fault & VM_FAULT_COMPLETED) return; /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. */ if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; goto retry; } mmap_read_unlock(mm); done: if (likely(!(fault & VM_FAULT_ERROR))) return; if (fatal_signal_pending(current) && !user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, 0, 0, ARCH_DEFAULT_PKEY); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, SEGV_MAPERR, ARCH_DEFAULT_PKEY); return; } /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got * oom-killed): */ pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else if (fault & VM_FAULT_SIGSEGV) bad_area_nosemaphore(regs, error_code, address); else BUG(); } } NOKPROBE_SYMBOL(do_user_addr_fault); static __always_inline void trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else trace_page_fault_kernel(address, regs, error_code); } static __always_inline void handle_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { trace_page_fault_entries(regs, error_code, address); if (unlikely(kmmio_fault(regs, address))) return; /* Was the fault on kernel-controlled part of the address space? */ if (unlikely(fault_in_kernel_space(address))) { do_kern_addr_fault(regs, error_code, address); } else { do_user_addr_fault(regs, error_code, address); /* * User address page fault handling might have reenabled * interrupts. Fixing up all potential exit points of * do_user_addr_fault() and its leaf functions is just not * doable w/o creating an unholy mess or turning the code * upside down. */ local_irq_disable(); } } DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { irqentry_state_t state; unsigned long address; address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); /* * KVM uses #PF vector to deliver 'page not present' events to guests * (asynchronous page fault mechanism). The event happens when a * userspace task is trying to access some valid (from guest's point of * view) memory which is not currently mapped by the host (e.g. the * memory is swapped out). Note, the corresponding "page ready" event * which is injected when the memory becomes available, is delivered via * an interrupt mechanism and not a #PF exception * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the * interrupted context had IF=1. We are also relying on the KVM * async pf type field and CR2 being read consistently instead of * getting values from real and async page faults mixed up. * * Fingers crossed. * * The async #PF handling code takes care of idtentry handling * itself. */ if (kvm_handle_async_pf(regs, (u32)address)) return; /* * Entry handling for valid #PF from kernel mode is slightly * different: RCU is already watching and ct_irq_enter() must not * be invoked because a kernel fault on a user space address might * sleep. * * In case the fault hit a RCU idle region the conditional entry * code reenabled RCU to avoid subsequent wreckage which helps * debuggability. */ state = irqentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); irqentry_exit(regs, state); }
20 22 2 7 22 22 22 22 27 26 32 3 27 2 12 5 7 36 7 30 8 28 28 26 9 17 4 1 22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> /* bus for j1939 remote devices * Since rtnetlink, no real bus is used. */ #include <net/sock.h> #include "j1939-priv.h" static void __j1939_ecu_release(struct kref *kref) { struct j1939_ecu *ecu = container_of(kref, struct j1939_ecu, kref); struct j1939_priv *priv = ecu->priv; list_del(&ecu->list); kfree(ecu); j1939_priv_put(priv); } void j1939_ecu_put(struct j1939_ecu *ecu) { kref_put(&ecu->kref, __j1939_ecu_release); } static void j1939_ecu_get(struct j1939_ecu *ecu) { kref_get(&ecu->kref); } static bool j1939_ecu_is_mapped_locked(struct j1939_ecu *ecu) { struct j1939_priv *priv = ecu->priv; lockdep_assert_held(&priv->lock); return j1939_ecu_find_by_addr_locked(priv, ecu->addr) == ecu; } /* ECU device interface */ /* map ECU to a bus address space */ static void j1939_ecu_map_locked(struct j1939_ecu *ecu) { struct j1939_priv *priv = ecu->priv; struct j1939_addr_ent *ent; lockdep_assert_held(&priv->lock); if (!j1939_address_is_unicast(ecu->addr)) return; ent = &priv->ents[ecu->addr]; if (ent->ecu) { netdev_warn(priv->ndev, "Trying to map already mapped ECU, addr: 0x%02x, name: 0x%016llx. Skip it.\n", ecu->addr, ecu->name); return; } j1939_ecu_get(ecu); ent->ecu = ecu; ent->nusers += ecu->nusers; } /* unmap ECU from a bus address space */ void j1939_ecu_unmap_locked(struct j1939_ecu *ecu) { struct j1939_priv *priv = ecu->priv; struct j1939_addr_ent *ent; lockdep_assert_held(&priv->lock); if (!j1939_address_is_unicast(ecu->addr)) return; if (!j1939_ecu_is_mapped_locked(ecu)) return; ent = &priv->ents[ecu->addr]; ent->ecu = NULL; ent->nusers -= ecu->nusers; j1939_ecu_put(ecu); } void j1939_ecu_unmap(struct j1939_ecu *ecu) { write_lock_bh(&ecu->priv->lock); j1939_ecu_unmap_locked(ecu); write_unlock_bh(&ecu->priv->lock); } void j1939_ecu_unmap_all(struct j1939_priv *priv) { int i; write_lock_bh(&priv->lock); for (i = 0; i < ARRAY_SIZE(priv->ents); i++) if (priv->ents[i].ecu) j1939_ecu_unmap_locked(priv->ents[i].ecu); write_unlock_bh(&priv->lock); } void j1939_ecu_timer_start(struct j1939_ecu *ecu) { /* The ECU is held here and released in the * j1939_ecu_timer_handler() or j1939_ecu_timer_cancel(). */ j1939_ecu_get(ecu); /* Schedule timer in 250 msec to commit address change. */ hrtimer_start(&ecu->ac_timer, ms_to_ktime(250), HRTIMER_MODE_REL_SOFT); } void j1939_ecu_timer_cancel(struct j1939_ecu *ecu) { if (hrtimer_cancel(&ecu->ac_timer)) j1939_ecu_put(ecu); } static enum hrtimer_restart j1939_ecu_timer_handler(struct hrtimer *hrtimer) { struct j1939_ecu *ecu = container_of(hrtimer, struct j1939_ecu, ac_timer); struct j1939_priv *priv = ecu->priv; write_lock_bh(&priv->lock); /* TODO: can we test if ecu->addr is unicast before starting * the timer? */ j1939_ecu_map_locked(ecu); /* The corresponding j1939_ecu_get() is in * j1939_ecu_timer_start(). */ j1939_ecu_put(ecu); write_unlock_bh(&priv->lock); return HRTIMER_NORESTART; } struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; lockdep_assert_held(&priv->lock); ecu = kzalloc(sizeof(*ecu), gfp_any()); if (!ecu) return ERR_PTR(-ENOMEM); kref_init(&ecu->kref); ecu->addr = J1939_IDLE_ADDR; ecu->name = name; hrtimer_setup(&ecu->ac_timer, j1939_ecu_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); INIT_LIST_HEAD(&ecu->list); j1939_priv_get(priv); ecu->priv = priv; list_add_tail(&ecu->list, &priv->ecus); return ecu; } struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv, u8 addr) { lockdep_assert_held(&priv->lock); return priv->ents[addr].ecu; } struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv, u8 addr) { struct j1939_ecu *ecu; lockdep_assert_held(&priv->lock); if (!j1939_address_is_unicast(addr)) return NULL; ecu = j1939_ecu_find_by_addr_locked(priv, addr); if (ecu) j1939_ecu_get(ecu); return ecu; } struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr) { struct j1939_ecu *ecu; read_lock_bh(&priv->lock); ecu = j1939_ecu_get_by_addr_locked(priv, addr); read_unlock_bh(&priv->lock); return ecu; } /* get pointer to ecu without increasing ref counter */ static struct j1939_ecu *j1939_ecu_find_by_name_locked(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; lockdep_assert_held(&priv->lock); list_for_each_entry(ecu, &priv->ecus, list) { if (ecu->name == name) return ecu; } return NULL; } struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; lockdep_assert_held(&priv->lock); if (!name) return NULL; ecu = j1939_ecu_find_by_name_locked(priv, name); if (ecu) j1939_ecu_get(ecu); return ecu; } struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; read_lock_bh(&priv->lock); ecu = j1939_ecu_get_by_name_locked(priv, name); read_unlock_bh(&priv->lock); return ecu; } u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name) { struct j1939_ecu *ecu; int addr = J1939_IDLE_ADDR; if (!name) return J1939_NO_ADDR; read_lock_bh(&priv->lock); ecu = j1939_ecu_find_by_name_locked(priv, name); if (ecu && j1939_ecu_is_mapped_locked(ecu)) /* ecu's SA is registered */ addr = ecu->addr; read_unlock_bh(&priv->lock); return addr; } /* TX addr/name accounting * Transport protocol needs to know if a SA is local or not * These functions originate from userspace manipulating sockets, * so locking is straigforward */ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) { struct j1939_ecu *ecu; int err = 0; write_lock_bh(&priv->lock); if (j1939_address_is_unicast(sa)) priv->ents[sa].nusers++; if (!name) goto done; ecu = j1939_ecu_get_by_name_locked(priv, name); if (!ecu) ecu = j1939_ecu_create_locked(priv, name); err = PTR_ERR_OR_ZERO(ecu); if (err) goto done; ecu->nusers++; /* TODO: do we care if ecu->addr != sa? */ if (j1939_ecu_is_mapped_locked(ecu)) /* ecu's sa is active already */ priv->ents[ecu->addr].nusers++; done: write_unlock_bh(&priv->lock); return err; } void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa) { struct j1939_ecu *ecu; write_lock_bh(&priv->lock); if (j1939_address_is_unicast(sa)) priv->ents[sa].nusers--; if (!name) goto done; ecu = j1939_ecu_find_by_name_locked(priv, name); if (WARN_ON_ONCE(!ecu)) goto done; ecu->nusers--; /* TODO: do we care if ecu->addr != sa? */ if (j1939_ecu_is_mapped_locked(ecu)) /* ecu's sa is active already */ priv->ents[ecu->addr].nusers--; j1939_ecu_put(ecu); done: write_unlock_bh(&priv->lock); }
443 443 2 2 452 452 444 445 6 2 2 4 4 4 4 2 2 2 4 374 458 147 6 12 135 17 1 16 12 13 13 13 17 17 277 17 15 274 60 139 139 139 137 139 138 139 135 3 4 4 4 3 1 1 147 6 147 6 3 1 2 1 1 1 1 12 2 68 67 53 261 218 170 1 459 457 6 446 134 261 45 150 37 227 227 227 226 227 226 226 226 238 2 227 227 7 1 9 6 6 9 2 6 6 6 1 6 3 1 1 3 1 7 4 3 3 1 1 1 1 11 8 3 3 10 7 7 15 3 9 3 4 9 4 3 2 13 6 6 13 1 12 2 11 1 2 9 10 6 6 3 8 12 1 12 12 13 18 2 14 142 144 144 143 95 68 1 1 154 156 49 45 131 9 2 3 7 152 5 8 6 2 1 3 3 8 4 2 2 4 3 1 2 1 1 5 3 1 1 59 1 1 57 13 2 11 1 5 1 1 1 1 2 15 492 492 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module for IP set management */ #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/rculist.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/ipset/ip_set.h> static LIST_HEAD(ip_set_type_list); /* all registered set types */ static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ struct ip_set_net { struct ip_set * __rcu *ip_set_list; /* all individual sets */ ip_set_id_t ip_set_max; /* max number of sets */ bool is_deleted; /* deleted by ip_set_net_exit */ bool is_destroyed; /* all sets are destroyed */ }; static unsigned int ip_set_net_id __read_mostly; static struct ip_set_net *ip_set_pernet(struct net *net) { return net_generic(net, ip_set_net_id); } #define IP_SET_INC 64 #define STRNCMP(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) static unsigned int max_sets; module_param(max_sets, int, 0600); MODULE_PARM_DESC(max_sets, "maximal number of sets"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex or ip_set_ref_lock is held: */ #define ip_set_dereference(inst) \ rcu_dereference_protected((inst)->ip_set_list, \ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ lockdep_is_held(&ip_set_ref_lock) || \ (inst)->is_deleted) #define ip_set(inst, id) \ ip_set_dereference(inst)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] #define ip_set_dereference_nfnl(p) \ rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is * serialized by ip_set_type_mutex. */ static void ip_set_type_lock(void) { mutex_lock(&ip_set_type_mutex); } static void ip_set_type_unlock(void) { mutex_unlock(&ip_set_type_mutex); } /* Register and deregister settype */ static struct ip_set_type * find_set_type(const char *name, u8 family, u8 revision) { struct ip_set_type *type; list_for_each_entry_rcu(type, &ip_set_type_list, list, lockdep_is_held(&ip_set_type_mutex)) if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC) && revision >= type->revision_min && revision <= type->revision_max) return type; return NULL; } /* Unlock, try to load a set type module and lock again */ static bool load_settype(const char *name) { if (!try_module_get(THIS_MODULE)) return false; nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { pr_warn("Can't find ip_set type %s\n", name); nfnl_lock(NFNL_SUBSYS_IPSET); module_put(THIS_MODULE); return false; } nfnl_lock(NFNL_SUBSYS_IPSET); module_put(THIS_MODULE); return true; } /* Find a set type and reference it */ #define find_set_type_get(name, family, revision, found) \ __find_set_type_get(name, family, revision, found, false) static int __find_set_type_get(const char *name, u8 family, u8 revision, struct ip_set_type **found, bool retry) { struct ip_set_type *type; int err; if (retry && !load_settype(name)) return -IPSET_ERR_FIND_TYPE; rcu_read_lock(); *found = find_set_type(name, family, revision); if (*found) { err = !try_module_get((*found)->me) ? -EFAULT : 0; goto unlock; } /* Make sure the type is already loaded * but we don't support the revision */ list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STRNCMP(type->name, name)) { err = -IPSET_ERR_FIND_TYPE; goto unlock; } rcu_read_unlock(); return retry ? -IPSET_ERR_FIND_TYPE : __find_set_type_get(name, family, revision, found, true); unlock: rcu_read_unlock(); return err; } /* Find a given set type by name and family. * If we succeeded, the supported minimal and maximum revisions are * filled out. */ #define find_set_type_minmax(name, family, min, max) \ __find_set_type_minmax(name, family, min, max, false) static int __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, bool retry) { struct ip_set_type *type; bool found = false; if (retry && !load_settype(name)) return -IPSET_ERR_FIND_TYPE; *min = 255; *max = 0; rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC)) { found = true; if (type->revision_min < *min) *min = type->revision_min; if (type->revision_max > *max) *max = type->revision_max; } rcu_read_unlock(); if (found) return 0; return retry ? -IPSET_ERR_FIND_TYPE : __find_set_type_minmax(name, family, min, max, true); } #define family_name(f) ((f) == NFPROTO_IPV4 ? "inet" : \ (f) == NFPROTO_IPV6 ? "inet6" : "any") /* Register a set type structure. The type is identified by * the unique triple of name, family and revision. */ int ip_set_type_register(struct ip_set_type *type) { int ret = 0; if (type->protocol != IPSET_PROTOCOL) { pr_warn("ip_set type %s, family %s, revision %u:%u uses wrong protocol version %u (want %u)\n", type->name, family_name(type->family), type->revision_min, type->revision_max, type->protocol, IPSET_PROTOCOL); return -EINVAL; } ip_set_type_lock(); if (find_set_type(type->name, type->family, type->revision_min)) { /* Duplicate! */ pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", type->name, family_name(type->family), type->revision_min); ip_set_type_unlock(); return -EINVAL; } list_add_rcu(&type->list, &ip_set_type_list); pr_debug("type %s, family %s, revision %u:%u registered.\n", type->name, family_name(type->family), type->revision_min, type->revision_max); ip_set_type_unlock(); return ret; } EXPORT_SYMBOL_GPL(ip_set_type_register); /* Unregister a set type. There's a small race with ip_set_create */ void ip_set_type_unregister(struct ip_set_type *type) { ip_set_type_lock(); if (!find_set_type(type->name, type->family, type->revision_min)) { pr_warn("ip_set type %s, family %s with revision min %u not registered\n", type->name, family_name(type->family), type->revision_min); ip_set_type_unlock(); return; } list_del_rcu(&type->list); pr_debug("type %s, family %s with revision min %u unregistered.\n", type->name, family_name(type->family), type->revision_min); ip_set_type_unlock(); synchronize_rcu(); } EXPORT_SYMBOL_GPL(ip_set_type_unregister); /* Utility functions */ void * ip_set_alloc(size_t size) { return kvzalloc(size, GFP_KERNEL_ACCOUNT); } EXPORT_SYMBOL_GPL(ip_set_alloc); void ip_set_free(void *members) { pr_debug("%p: free with %s\n", members, is_vmalloc_addr(members) ? "vfree" : "kfree"); kvfree(members); } EXPORT_SYMBOL_GPL(ip_set_free); static bool flag_nested(const struct nlattr *nla) { return nla->nla_type & NLA_F_NESTED; } static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 }, [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), }; int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) { struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) return -IPSET_ERR_PROTOCOL; *ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) { struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) return -IPSET_ERR_PROTOCOL; memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), sizeof(struct in6_addr)); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); static u32 ip_set_timeout_get(const unsigned long *timeout) { u32 t; if (*timeout == IPSET_ELEM_PERMANENT) return 0; t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC; /* Zero value in userspace means no timeout */ return t == 0 ? 1 : t; } static char * ip_set_comment_uget(struct nlattr *tb) { return nla_data(tb); } /* Called from uadd only, protected by the set spinlock. * The kadt functions don't use the comment extensions in any way. */ void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, const struct ip_set_ext *ext) { struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); size_t len = ext->comment ? strlen(ext->comment) : 0; if (unlikely(c)) { set->ext_size -= sizeof(*c) + strlen(c->str) + 1; kfree_rcu(c, rcu); rcu_assign_pointer(comment->c, NULL); } if (!len) return; if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) len = IPSET_MAX_COMMENT_SIZE; c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); if (unlikely(!c)) return; strscpy(c->str, ext->comment, len + 1); set->ext_size += sizeof(*c) + strlen(c->str) + 1; rcu_assign_pointer(comment->c, c); } EXPORT_SYMBOL_GPL(ip_set_init_comment); /* Used only when dumping a set, protected by rcu_read_lock() */ static int ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) { struct ip_set_comment_rcu *c = rcu_dereference(comment->c); if (!c) return 0; return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); } /* Called from uadd/udel, flush or the garbage collectors protected * by the set spinlock. * Called when the set is destroyed and when there can't be any user * of the set data anymore. */ static void ip_set_comment_free(struct ip_set *set, void *ptr) { struct ip_set_comment *comment = ptr; struct ip_set_comment_rcu *c; c = rcu_dereference_protected(comment->c, 1); if (unlikely(!c)) return; set->ext_size -= sizeof(*c) + strlen(c->str) + 1; kfree_rcu(c, rcu); rcu_assign_pointer(comment->c, NULL); } typedef void (*destroyer)(struct ip_set *, void *); /* ipset data extension types, in size order */ const struct ip_set_ext_type ip_set_extensions[] = { [IPSET_EXT_ID_COUNTER] = { .type = IPSET_EXT_COUNTER, .flag = IPSET_FLAG_WITH_COUNTERS, .len = sizeof(struct ip_set_counter), .align = __alignof__(struct ip_set_counter), }, [IPSET_EXT_ID_TIMEOUT] = { .type = IPSET_EXT_TIMEOUT, .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, [IPSET_EXT_ID_SKBINFO] = { .type = IPSET_EXT_SKBINFO, .flag = IPSET_FLAG_WITH_SKBINFO, .len = sizeof(struct ip_set_skbinfo), .align = __alignof__(struct ip_set_skbinfo), }, [IPSET_EXT_ID_COMMENT] = { .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, .flag = IPSET_FLAG_WITH_COMMENT, .len = sizeof(struct ip_set_comment), .align = __alignof__(struct ip_set_comment), .destroy = ip_set_comment_free, }, }; EXPORT_SYMBOL_GPL(ip_set_extensions); static bool add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) { return ip_set_extensions[id].flag ? (flags & ip_set_extensions[id].flag) : !!tb[IPSET_ATTR_TIMEOUT]; } size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, size_t align) { enum ip_set_ext_id id; u32 cadt_flags = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) set->flags |= IPSET_CREATE_FLAG_FORCEADD; if (!align) align = 1; for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; if (align < ip_set_extensions[id].align) align = ip_set_extensions[id].align; len = ALIGN(len, ip_set_extensions[id].align); set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; len += ip_set_extensions[id].len; } return ALIGN(len, align); } EXPORT_SYMBOL_GPL(ip_set_elem_len); int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { u64 fullmark; if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_TIMEOUT]) { if (!SET_WITH_TIMEOUT(set)) return -IPSET_ERR_TIMEOUT; ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { if (!SET_WITH_COUNTER(set)) return -IPSET_ERR_COUNTER; if (tb[IPSET_ATTR_BYTES]) ext->bytes = be64_to_cpu(nla_get_be64( tb[IPSET_ATTR_BYTES])); if (tb[IPSET_ATTR_PACKETS]) ext->packets = be64_to_cpu(nla_get_be64( tb[IPSET_ATTR_PACKETS])); } if (tb[IPSET_ATTR_COMMENT]) { if (!SET_WITH_COMMENT(set)) return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } if (tb[IPSET_ATTR_SKBMARK]) { if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); ext->skbinfo.skbmark = fullmark >> 32; ext->skbinfo.skbmarkmask = fullmark & 0xffffffff; } if (tb[IPSET_ATTR_SKBPRIO]) { if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbinfo.skbprio = be32_to_cpu(nla_get_be32(tb[IPSET_ATTR_SKBPRIO])); } if (tb[IPSET_ATTR_SKBQUEUE]) { if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbinfo.skbqueue = be16_to_cpu(nla_get_be16(tb[IPSET_ATTR_SKBQUEUE])); } return 0; } EXPORT_SYMBOL_GPL(ip_set_get_extensions); static u64 ip_set_get_bytes(const struct ip_set_counter *counter) { return (u64)atomic64_read(&(counter)->bytes); } static u64 ip_set_get_packets(const struct ip_set_counter *counter) { return (u64)atomic64_read(&(counter)->packets); } static bool ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) { return nla_put_net64(skb, IPSET_ATTR_BYTES, cpu_to_be64(ip_set_get_bytes(counter)), IPSET_ATTR_PAD) || nla_put_net64(skb, IPSET_ATTR_PACKETS, cpu_to_be64(ip_set_get_packets(counter)), IPSET_ATTR_PAD); } static bool ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) { /* Send nonzero parameters only */ return ((skbinfo->skbmark || skbinfo->skbmarkmask) && nla_put_net64(skb, IPSET_ATTR_SKBMARK, cpu_to_be64((u64)skbinfo->skbmark << 32 | skbinfo->skbmarkmask), IPSET_ATTR_PAD)) || (skbinfo->skbprio && nla_put_net32(skb, IPSET_ATTR_SKBPRIO, cpu_to_be32(skbinfo->skbprio))) || (skbinfo->skbqueue && nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, cpu_to_be16(skbinfo->skbqueue))); } int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active) { if (SET_WITH_TIMEOUT(set)) { unsigned long *timeout = ext_timeout(e, set); if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(active ? ip_set_timeout_get(timeout) : *timeout))) return -EMSGSIZE; } if (SET_WITH_COUNTER(set) && ip_set_put_counter(skb, ext_counter(e, set))) return -EMSGSIZE; if (SET_WITH_COMMENT(set) && ip_set_put_comment(skb, ext_comment(e, set))) return -EMSGSIZE; if (SET_WITH_SKBINFO(set) && ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) return -EMSGSIZE; return 0; } EXPORT_SYMBOL_GPL(ip_set_put_extensions); static bool ip_set_match_counter(u64 counter, u64 match, u8 op) { switch (op) { case IPSET_COUNTER_NONE: return true; case IPSET_COUNTER_EQ: return counter == match; case IPSET_COUNTER_NE: return counter != match; case IPSET_COUNTER_LT: return counter < match; case IPSET_COUNTER_GT: return counter > match; } return false; } static void ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) { atomic64_add((long long)bytes, &(counter)->bytes); } static void ip_set_add_packets(u64 packets, struct ip_set_counter *counter) { atomic64_add((long long)packets, &(counter)->packets); } static void ip_set_update_counter(struct ip_set_counter *counter, const struct ip_set_ext *ext, u32 flags) { if (ext->packets != ULLONG_MAX && !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { ip_set_add_bytes(ext->bytes, counter); ip_set_add_packets(ext->packets, counter); } } static void ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { mext->skbinfo = *skbinfo; } bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set))) return false; if (SET_WITH_COUNTER(set)) { struct ip_set_counter *counter = ext_counter(data, set); ip_set_update_counter(counter, ext, flags); if (flags & IPSET_FLAG_MATCH_COUNTERS && !(ip_set_match_counter(ip_set_get_packets(counter), mext->packets, mext->packets_op) && ip_set_match_counter(ip_set_get_bytes(counter), mext->bytes, mext->bytes_op))) return false; } if (SET_WITH_SKBINFO(set)) ip_set_get_skbinfo(ext_skbinfo(data, set), ext, mext, flags); return true; } EXPORT_SYMBOL_GPL(ip_set_match_extensions); /* Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace * only and serialized by the nfnl mutex indirectly from nfnetlink. * * Sets are identified by their index in ip_set_list and the index * is used by the external references (set/SET netfilter modules). * * The set behind an index may change by swapping only, from userspace. */ static void __ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); set->ref++; write_unlock_bh(&ip_set_ref_lock); } static void __ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); BUG_ON(set->ref == 0); set->ref--; write_unlock_bh(&ip_set_ref_lock); } /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ static void __ip_set_get_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); set->ref_netlink++; write_unlock_bh(&ip_set_ref_lock); } static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); BUG_ON(set->ref_netlink == 0); set->ref_netlink--; write_unlock_bh(&ip_set_ref_lock); } /* Add, del and test set entries from kernel. * * The set behind the index must exist and must be referenced * so it can't be destroyed (or changed) under our foot. */ static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { struct ip_set_net *inst = ip_set_pernet(net); /* ip_set_list and the set pointer need to be protected */ return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } static inline void ip_set_lock(struct ip_set *set) { if (!set->variant->region_lock) spin_lock_bh(&set->lock); } static inline void ip_set_unlock(struct ip_set *set) { if (!set->variant->region_lock) spin_unlock_bh(&set->lock); } int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(xt_net(par), index); int ret = 0; BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); ip_set_lock(set); set->variant->kadt(set, skb, par, IPSET_ADD, opt); ip_set_unlock(set); ret = 1; } else { /* --return-nomatch: invert matched element */ if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) && (set->type->features & IPSET_TYPE_NOMATCH) && (ret > 0 || ret == -ENOTEMPTY)) ret = -ret; } /* Convert error codes to nomatch */ return (ret < 0 ? 0 : ret); } EXPORT_SYMBOL_GPL(ip_set_test); int ip_set_add(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(xt_net(par), index); int ret; BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); ip_set_unlock(set); return ret; } EXPORT_SYMBOL_GPL(ip_set_add); int ip_set_del(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(xt_net(par), index); int ret = 0; BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); ip_set_unlock(set); return ret; } EXPORT_SYMBOL_GPL(ip_set_del); /* Find set by name, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * */ ip_set_id_t ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) { ip_set_id_t i, index = IPSET_INVALID_ID; struct ip_set *s; struct ip_set_net *inst = ip_set_pernet(net); rcu_read_lock(); for (i = 0; i < inst->ip_set_max; i++) { s = rcu_dereference(inst->ip_set_list)[i]; if (s && STRNCMP(s->name, name)) { __ip_set_get(s); index = i; *set = s; break; } } rcu_read_unlock(); return index; } EXPORT_SYMBOL_GPL(ip_set_get_byname); /* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * */ static void __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) { struct ip_set *set; rcu_read_lock(); set = rcu_dereference(inst->ip_set_list)[index]; if (set) __ip_set_put(set); rcu_read_unlock(); } void ip_set_put_byindex(struct net *net, ip_set_id_t index) { struct ip_set_net *inst = ip_set_pernet(net); __ip_set_put_byindex(inst, index); } EXPORT_SYMBOL_GPL(ip_set_put_byindex); /* Get the name of a set behind a set index. * Set itself is protected by RCU, but its name isn't: to protect against * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the * name. */ void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) { struct ip_set *set = ip_set_rcu_get(net, index); BUG_ON(!set); read_lock_bh(&ip_set_ref_lock); strscpy_pad(name, set->name, IPSET_MAXNAMELEN); read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); /* Routines to call by external subsystems, which do not * call nfnl_lock for us. */ /* Find set by index, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * * The nfnl mutex is used in the function. */ ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) { struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); if (index >= inst->ip_set_max) return IPSET_INVALID_ID; nfnl_lock(NFNL_SUBSYS_IPSET); set = ip_set(inst, index); if (set) __ip_set_get(set); else index = IPSET_INVALID_ID; nfnl_unlock(NFNL_SUBSYS_IPSET); return index; } EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); /* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * * The nfnl mutex is used in the function. */ void ip_set_nfnl_put(struct net *net, ip_set_id_t index) { struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); nfnl_lock(NFNL_SUBSYS_IPSET); if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ set = ip_set(inst, index); if (set) __ip_set_put(set); } nfnl_unlock(NFNL_SUBSYS_IPSET); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); /* Communication protocol with userspace over netlink. * * The commands are serialized by the nfnl mutex. */ static inline u8 protocol(const struct nlattr * const tb[]) { return nla_get_u8(tb[IPSET_ATTR_PROTOCOL]); } static inline bool protocol_failed(const struct nlattr * const tb[]) { return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) != IPSET_PROTOCOL; } static inline bool protocol_min_failed(const struct nlattr * const tb[]) { return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) < IPSET_PROTOCOL_MIN; } static inline u32 flag_exist(const struct nlmsghdr *nlh) { return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST; } static struct nlmsghdr * start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, enum ipset_cmd cmd) { return nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags, NFPROTO_IPV4, NFNETLINK_V0, 0); } /* Create a set */ static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1}, [IPSET_ATTR_REVISION] = { .type = NLA_U8 }, [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, }; static struct ip_set * find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) { struct ip_set *set = NULL; ip_set_id_t i; *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); if (set && STRNCMP(set->name, name)) { *id = i; break; } } return (*id == IPSET_INVALID_ID ? NULL : set); } static inline struct ip_set * find_set(struct ip_set_net *inst, const char *name) { ip_set_id_t id; return find_set_and_id(inst, name, &id); } static int find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, struct ip_set **set) { struct ip_set *s; ip_set_id_t i; *index = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (!s) { if (*index == IPSET_INVALID_ID) *index = i; } else if (STRNCMP(name, s->name)) { /* Name clash */ *set = s; return -EEXIST; } } if (*index == IPSET_INVALID_ID) /* No free slot remained */ return -IPSET_ERR_MAX_SETS; return 0; } static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { return -EOPNOTSUPP; } static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; u32 flags = flag_exist(info->nlh); int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_TYPENAME] || !attr[IPSET_ATTR_REVISION] || !attr[IPSET_ATTR_FAMILY] || (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])))) return -IPSET_ERR_PROTOCOL; name = nla_data(attr[IPSET_ATTR_SETNAME]); typename = nla_data(attr[IPSET_ATTR_TYPENAME]); family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); revision = nla_get_u8(attr[IPSET_ATTR_REVISION]); pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", name, typename, family_name(family), revision); /* First, and without any locks, allocate and initialize * a normal base set structure. */ set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; spin_lock_init(&set->lock); strscpy(set->name, name, IPSET_MAXNAMELEN); set->family = family; set->revision = revision; /* Next, check that we know the type, and take * a reference on the type, to make sure it stays available * while constructing our new set. * * After referencing the type, we try to create the type * specific part of the set without holding any locks. */ ret = find_set_type_get(typename, family, revision, &set->type); if (ret) goto out; /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy, NULL)) { ret = -IPSET_ERR_PROTOCOL; goto put_out; } /* Set create flags depending on the type revision */ set->flags |= set->type->create_flags[revision]; ret = set->type->create(info->net, set, tb, flags); if (ret != 0) goto put_out; /* BTW, ret==0 here. */ /* Here, we have a valid, constructed set and we are protected * by the nfnl mutex. Find the first free index in ip_set_list * and check clashing. */ ret = find_free_id(inst, set->name, &index, &clash); if (ret == -EEXIST) { /* If this is the same set and requested, ignore error */ if ((flags & IPSET_FLAG_EXIST) && STRNCMP(set->type->name, clash->type->name) && set->type->family == clash->type->family && set->type->revision_min == clash->type->revision_min && set->type->revision_max == clash->type->revision_max && set->variant->same_set(set, clash)) ret = 0; goto cleanup; } else if (ret == -IPSET_ERR_MAX_SETS) { struct ip_set **list, **tmp; ip_set_id_t i = inst->ip_set_max + IP_SET_INC; if (i < inst->ip_set_max || i == IPSET_INVALID_ID) /* Wraparound */ goto cleanup; list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ tmp = ip_set_dereference(inst); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ synchronize_net(); /* Use new list */ index = inst->ip_set_max; inst->ip_set_max = i; kvfree(tmp); ret = 0; } else if (ret) { goto cleanup; } /* Finally! Add our shiny new set to the list, and be done. */ pr_debug("create: '%s' created with index %u!\n", set->name, index); ip_set(inst, index) = set; return ret; cleanup: set->variant->cancel_gc(set); set->variant->destroy(set); put_out: module_put(set->type->me); out: kfree(set); return ret; } /* Destroy sets */ static const struct nla_policy ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, }; /* In order to return quickly when destroying a single set, it is split * into two stages: * - Cancel garbage collector * - Destroy the set itself via call_rcu() */ static void ip_set_destroy_set_rcu(struct rcu_head *head) { struct ip_set *set = container_of(head, struct ip_set, rcu); set->variant->destroy(set); module_put(set->type->me); kfree(set); } static void _destroy_all_sets(struct ip_set_net *inst) { struct ip_set *set; ip_set_id_t i; bool need_wait = false; /* First cancel gc's: set:list sets are flushed as well */ for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); if (set) { set->variant->cancel_gc(set); if (set->type->features & IPSET_TYPE_NAME) need_wait = true; } } /* Must wait for flush to be really finished */ if (need_wait) rcu_barrier(); for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); if (set) { ip_set(inst, i) = NULL; set->variant->destroy(set); module_put(set->type->me); kfree(set); } } } static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; int ret = 0; if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call * ip_set_nfnl_get_* functions, that way we * can safely check references here. * * list:set timer can only decrement the reference * counter, so if it's already zero, we can proceed * without holding the lock. */ if (!attr[IPSET_ATTR_SETNAME]) { read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && (s->ref || s->ref_netlink)) { ret = -IPSET_ERR_BUSY; goto out; } } inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); _destroy_all_sets(inst); /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { u32 flags = flag_exist(info->nlh); u16 features = 0; read_lock_bh(&ip_set_ref_lock); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { if (!(flags & IPSET_FLAG_EXIST)) ret = -ENOENT; goto out; } else if (s->ref || s->ref_netlink) { ret = -IPSET_ERR_BUSY; goto out; } features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); /* Must cancel garbage collectors */ s->variant->cancel_gc(s); if (features & IPSET_TYPE_NAME) { /* Must wait for flush to be really finished */ rcu_barrier(); } call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; out: read_unlock_bh(&ip_set_ref_lock); return ret; } /* Flush sets */ static void ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); ip_set_lock(set); set->variant->flush(set); ip_set_unlock(set); } static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s) ip_set_flush_set(s); } } else { s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!s) return -ENOENT; ip_set_flush_set(s); } return 0; } /* Rename a set */ static const struct nla_policy ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, }; static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *s; const char *name2; ip_set_id_t i; int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; write_lock_bh(&ip_set_ref_lock); if (set->ref != 0 || set->ref_netlink != 0) { ret = -IPSET_ERR_REFERENCED; goto out; } name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && STRNCMP(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } } strscpy_pad(set->name, name2, IPSET_MAXNAMELEN); out: write_unlock_bh(&ip_set_ref_lock); return ret; } /* Swap two sets so that name/index points to the other. * References and set names are also swapped. * * The commands are serialized by the nfnl mutex and references are * protected by the ip_set_ref_lock. The kernel interfaces * do not hold the mutex but the pointer settings are atomic * so the ip_set_list always contains valid pointers to the sets. */ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *from, *to; ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &from_id); if (!from) return -ENOENT; to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id); if (!to) return -IPSET_ERR_EXIST_SETNAME2; /* Features must not change. * Not an artifical restriction anymore, as we must prevent * possible loops created by swapping in setlist type of sets. */ if (!(from->type->features == to->type->features && from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; write_lock_bh(&ip_set_ref_lock); if (from->ref_netlink || to->ref_netlink) { write_unlock_bh(&ip_set_ref_lock); return -EBUSY; } strscpy_pad(from_name, from->name, IPSET_MAXNAMELEN); strscpy_pad(from->name, to->name, IPSET_MAXNAMELEN); strscpy_pad(to->name, from_name, IPSET_MAXNAMELEN); swap(from->ref, to->ref); ip_set(inst, from_id) = to; ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); return 0; } /* List/save set data */ #define DUMP_INIT 0 #define DUMP_ALL 1 #define DUMP_ONE 2 #define DUMP_LAST 3 #define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) #define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) int ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) { u32 cadt_flags = 0; if (SET_WITH_TIMEOUT(set)) if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(set->timeout)))) return -EMSGSIZE; if (SET_WITH_COUNTER(set)) cadt_flags |= IPSET_FLAG_WITH_COUNTERS; if (SET_WITH_COMMENT(set)) cadt_flags |= IPSET_FLAG_WITH_COMMENT; if (SET_WITH_SKBINFO(set)) cadt_flags |= IPSET_FLAG_WITH_SKBINFO; if (SET_WITH_FORCEADD(set)) cadt_flags |= IPSET_FLAG_WITH_FORCEADD; if (!cadt_flags) return 0; return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); } EXPORT_SYMBOL_GPL(ip_set_put_flags); static int ip_set_dump_done(struct netlink_callback *cb) { if (cb->args[IPSET_CB_ARG0]) { struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; struct ip_set *set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); __ip_set_put_netlink(set); } return 0; } static inline void dump_attrs(struct nlmsghdr *nlh) { const struct nlattr *attr; int rem; pr_debug("dump nlmsg\n"); nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) { pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len); } } static const struct nla_policy ip_set_dump_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_FLAGS] = { .type = NLA_U32 }, }; static int ip_set_dump_start(struct netlink_callback *cb) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; struct sk_buff *skb = cb->skb; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type; int ret; ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, ip_set_dump_policy, NULL); if (ret) goto error; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { ip_set_id_t index; struct ip_set *set; set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); if (!set) { ret = -ENOENT; goto error; } dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; } else { dump_type = DUMP_ALL; } if (cda[IPSET_ATTR_FLAGS]) { u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); dump_type |= (f << 16); } cb->args[IPSET_CB_NET] = (unsigned long)inst; cb->args[IPSET_CB_DUMP] = dump_type; return 0; error: /* We have to create and send the error message manually :-( */ if (nlh->nlmsg_flags & NLM_F_ACK) { netlink_ack(cb->skb, nlh, ret, NULL); } return ret; } static int ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb) { ip_set_id_t index = IPSET_INVALID_ID, max; struct ip_set *set = NULL; struct nlmsghdr *nlh = NULL; unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type, dump_flags; bool is_destroyed; int ret = 0; if (!cb->args[IPSET_CB_DUMP]) return -EINVAL; if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max) goto out; dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]); dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]); max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1 : inst->ip_set_max; dump_last: pr_debug("dump type, flag: %u %u index: %ld\n", dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; write_lock_bh(&ip_set_ref_lock); set = ip_set(inst, index); is_destroyed = inst->is_destroyed; if (!set || is_destroyed) { write_unlock_bh(&ip_set_ref_lock); if (dump_type == DUMP_ONE) { ret = -ENOENT; goto out; } if (is_destroyed) { /* All sets are just being destroyed */ ret = 0; goto out; } continue; } /* When dumping all sets, we must dump "sorted" * so that lists (unions of sets) are dumped last. */ if (dump_type != DUMP_ONE && ((dump_type == DUMP_ALL) == !!(set->type->features & IPSET_DUMP_LAST))) { write_unlock_bh(&ip_set_ref_lock); continue; } pr_debug("List set: %s\n", set->name); if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); set->ref_netlink++; } write_unlock_bh(&ip_set_ref_lock); nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, IPSET_CMD_LIST); if (!nlh) { ret = -EMSGSIZE; goto release_refcount; } if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, cb->args[IPSET_CB_PROTO]) || nla_put_string(skb, IPSET_ATTR_SETNAME, set->name)) goto nla_put_failure; if (dump_flags & IPSET_FLAG_LIST_SETNAME) goto next_set; switch (cb->args[IPSET_CB_ARG0]) { case 0: /* Core header data */ if (nla_put_string(skb, IPSET_ATTR_TYPENAME, set->type->name) || nla_put_u8(skb, IPSET_ATTR_FAMILY, set->family) || nla_put_u8(skb, IPSET_ATTR_REVISION, set->revision)) goto nla_put_failure; if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN && nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index))) goto nla_put_failure; ret = set->variant->head(set, skb); if (ret < 0) goto release_refcount; if (dump_flags & IPSET_FLAG_LIST_HEADER) goto next_set; if (set->variant->uref) set->variant->uref(set, cb, true); fallthrough; default: ret = set->variant->list(set, skb, cb); if (!cb->args[IPSET_CB_ARG0]) /* Set is done, proceed with next one */ goto next_set; goto release_refcount; } } /* If we dump all sets, continue with dumping last ones */ if (dump_type == DUMP_ALL) { dump_type = DUMP_LAST; cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); cb->args[IPSET_CB_INDEX] = 0; if (set && set->variant->uref) set->variant->uref(set, cb, false); goto dump_last; } goto out; nla_put_failure: ret = -EFAULT; next_set: if (dump_type == DUMP_ONE) cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID; else cb->args[IPSET_CB_INDEX]++; release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); __ip_set_put_netlink(set); cb->args[IPSET_CB_ARG0] = 0; } out: if (nlh) { nlmsg_end(skb, nlh); pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len); dump_attrs(nlh); } return ret < 0 ? ret : skb->len; } static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; { struct netlink_dump_control c = { .start = ip_set_dump_start, .dump = ip_set_dump_do, .done = ip_set_dump_done, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } } /* Add, del and test */ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, [IPSET_ATTR_ADT] = { .type = NLA_NESTED }, }; static int call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 flags, bool use_lineno) { int ret; u32 lineno = 0; bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { if (retried) { __ip_set_get_netlink(set); nfnl_unlock(NFNL_SUBSYS_IPSET); cond_resched(); nfnl_lock(NFNL_SUBSYS_IPSET); __ip_set_put_netlink(set); } ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); retried = true; } while (ret == -ERANGE || (ret == -EAGAIN && set->variant->resize && (ret = set->variant->resize(set, retried)) == 0)); if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) return 0; if (lineno && use_lineno) { /* Error in restore/batch mode: send back lineno */ struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; struct nlmsgerr *errmsg; size_t payload = min(SIZE_MAX, sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *cmdattr; u32 *errline; skb2 = nlmsg_new(payload, GFP_KERNEL); if (!skb2) return -ENOMEM; rep = nlmsg_put(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = ret; unsafe_memcpy(&errmsg->msg, nlh, nlh->nlmsg_len, /* Bounds checked by the skb layer. */); cmdattr = (void *)&errmsg->msg + min_len; ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, nlh->nlmsg_len - min_len, ip_set_adt_policy, NULL); if (ret) { nlmsg_free(skb2); return ret; } errline = nla_data(cda[IPSET_ATTR_LINENO]); *errline = lineno; nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } return ret; } static int ip_set_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, enum ipset_adt adt, const struct nlmsghdr *nlh, const struct nlattr * const attr[], struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; if (attr[IPSET_ATTR_DATA]) { if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; } } return ret; } static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { return ip_set_ad(info->net, info->sk, skb, IPSET_ADD, info->nlh, attr, info->extack); } static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { return ip_set_ad(info->net, info->sk, skb, IPSET_DEL, info->nlh, attr, info->extack); } static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; u32 lineno; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_DATA] || !flag_nested(attr[IPSET_ATTR_DATA]))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0); rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) ret = 1; return ret > 0 ? 0 : -IPSET_ERR_EXIST; } /* Get headed data of a set */ static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_HEADER); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) || nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision)) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } /* Get type data */ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, }; static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; u8 family, min, max; const char *typename; int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_TYPENAME] || !attr[IPSET_ATTR_FAMILY])) return -IPSET_ERR_PROTOCOL; family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); typename = nla_data(attr[IPSET_ATTR_TYPENAME]); ret = find_set_type_minmax(typename, family, &min, &max); if (ret) return ret; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_TYPE); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) || nla_put_u8(skb2, IPSET_ATTR_REVISION, max) || nla_put_u8(skb2, IPSET_ATTR_REVISION_MIN, min)) goto nla_put_failure; nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } /* Get protocol version */ static const struct nla_policy ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, }; static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_PROTOCOL); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL)) goto nla_put_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL_MIN, IPSET_PROTOCOL_MIN)) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } /* Get set by name or index, from userspace */ static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &id); if (id == IPSET_INVALID_ID) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYNAME); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || nla_put_net16(skb2, IPSET_ATTR_INDEX, htons(id))) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_INDEX] = { .type = NLA_U16 }, }; static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_INDEX])) return -IPSET_ERR_PROTOCOL; id = ip_set_get_h16(attr[IPSET_ATTR_INDEX]); if (id >= inst->ip_set_max) return -ENOENT; set = ip_set(inst, id); if (set == NULL) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYINDEX); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name)) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_NONE] = { .call = ip_set_none, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, }, [IPSET_CMD_CREATE] = { .call = ip_set_create, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_create_policy, }, [IPSET_CMD_DESTROY] = { .call = ip_set_destroy, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_FLUSH] = { .call = ip_set_flush, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_RENAME] = { .call = ip_set_rename, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_SWAP] = { .call = ip_set_swap, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_LIST] = { .call = ip_set_dump, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_ADD] = { .call = ip_set_uadd, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_DEL] = { .call = ip_set_udel, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_TEST] = { .call = ip_set_utest, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_HEADER] = { .call = ip_set_header, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_TYPE] = { .call = ip_set_type, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_type_policy, }, [IPSET_CMD_PROTOCOL] = { .call = ip_set_protocol, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_protocol_policy, }, [IPSET_CMD_GET_BYNAME] = { .call = ip_set_byname, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_GET_BYINDEX] = { .call = ip_set_byindex, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_index_policy, }, }; static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = { .name = "ip_set", .subsys_id = NFNL_SUBSYS_IPSET, .cb_count = IPSET_MSG_MAX, .cb = ip_set_netlink_subsys_cb, }; /* Interface to iptables/ip6tables */ static int ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) { unsigned int *op; void *data; int copylen = *len, ret = 0; struct net *net = sock_net(sk); struct ip_set_net *inst = ip_set_pernet(net); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (optval != SO_IP_SET) return -EBADF; if (*len < sizeof(unsigned int)) return -EINVAL; data = vmalloc(*len); if (!data) return -ENOMEM; if (copy_from_user(data, user, *len) != 0) { ret = -EFAULT; goto done; } op = data; if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ struct ip_set_req_version *req_version = data; if (*len < sizeof(struct ip_set_req_version)) { ret = -EINVAL; goto done; } if (req_version->version < IPSET_PROTOCOL_MIN) { ret = -EPROTO; goto done; } } switch (*op) { case IP_SET_OP_VERSION: { struct ip_set_req_version *req_version = data; if (*len != sizeof(struct ip_set_req_version)) { ret = -EINVAL; goto done; } req_version->version = IPSET_PROTOCOL; if (copy_to_user(user, req_version, sizeof(struct ip_set_req_version))) ret = -EFAULT; goto done; } case IP_SET_OP_GET_BYNAME: { struct ip_set_req_get_set *req_get = data; ip_set_id_t id; if (*len != sizeof(struct ip_set_req_get_set)) { ret = -EINVAL; goto done; } req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; nfnl_lock(NFNL_SUBSYS_IPSET); find_set_and_id(inst, req_get->set.name, &id); req_get->set.index = id; nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } case IP_SET_OP_GET_FNAME: { struct ip_set_req_get_set_family *req_get = data; ip_set_id_t id; if (*len != sizeof(struct ip_set_req_get_set_family)) { ret = -EINVAL; goto done; } req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; nfnl_lock(NFNL_SUBSYS_IPSET); find_set_and_id(inst, req_get->set.name, &id); req_get->set.index = id; if (id != IPSET_INVALID_ID) req_get->family = ip_set(inst, id)->family; nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } case IP_SET_OP_GET_BYINDEX: { struct ip_set_req_get_set *req_get = data; struct ip_set *set; if (*len != sizeof(struct ip_set_req_get_set) || req_get->set.index >= inst->ip_set_max) { ret = -EINVAL; goto done; } nfnl_lock(NFNL_SUBSYS_IPSET); set = ip_set(inst, req_get->set.index); ret = strscpy(req_get->set.name, set ? set->name : "", IPSET_MAXNAMELEN); nfnl_unlock(NFNL_SUBSYS_IPSET); if (ret < 0) goto done; goto copy; } default: ret = -EBADMSG; goto done; } /* end of switch(op) */ copy: if (copy_to_user(user, data, copylen)) ret = -EFAULT; done: vfree(data); if (ret > 0) ret = 0; return ret; } static struct nf_sockopt_ops so_set __read_mostly = { .pf = PF_INET, .get_optmin = SO_IP_SET, .get_optmax = SO_IP_SET + 1, .get = ip_set_sockfn_get, .owner = THIS_MODULE, }; static int __net_init ip_set_net_init(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set **list; inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX; if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; inst->is_deleted = false; inst->is_destroyed = false; rcu_assign_pointer(inst->ip_set_list, list); return 0; } static void __net_exit ip_set_net_pre_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); inst->is_deleted = true; /* flag for ip_set_nfnl_put */ } static void __net_exit ip_set_net_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); _destroy_all_sets(inst); kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, .pre_exit = ip_set_net_pre_exit, .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), }; static int __init ip_set_init(void) { int ret = register_pernet_subsys(&ip_set_net_ops); if (ret) { pr_err("ip_set: cannot register pernet_subsys.\n"); return ret; } ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); unregister_pernet_subsys(&ip_set_net_ops); return ret; } ret = nf_register_sockopt(&so_set); if (ret != 0) { pr_err("SO_SET registry failed: %d\n", ret); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); unregister_pernet_subsys(&ip_set_net_ops); return ret; } return 0; } static void __exit ip_set_fini(void) { nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); unregister_pernet_subsys(&ip_set_net_ops); /* Wait for call_rcu() in destroy */ rcu_barrier(); pr_debug("these are the famous last words\n"); } module_init(ip_set_init); module_exit(ip_set_fini); MODULE_DESCRIPTION("ip_set: protocol " __stringify(IPSET_PROTOCOL));
95 52 52 1 100 1 1 2 96 33 3 6 7 3 1 43 52 2 2 1 1 1 1 1 3 1 1 1 1 1 1 1 2 1 1 1 13 2 1 10 8 1 1 1 1 1 1 1 1 6 492 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> * (C) 2012 by Vyatta Inc. <http://www.vyatta.com> */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/rculist.h> #include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/netfilter.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static unsigned int nfct_timeout_id __read_mostly; struct ctnl_timeout { struct list_head head; struct list_head free_head; struct rcu_head rcu_head; refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; /* must be at the end */ struct nf_ct_timeout timeout; }; struct nfct_timeout_pernet { struct list_head nfct_timeout_list; struct list_head nfct_timeout_freelist; }; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING, .len = CTNL_TIMEOUT_NAME_MAX - 1}, [CTA_TIMEOUT_L3PROTO] = { .type = NLA_U16 }, [CTA_TIMEOUT_L4PROTO] = { .type = NLA_U8 }, [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED }, }; static struct nfct_timeout_pernet *nfct_timeout_pernet(struct net *net) { return net_generic(net, nfct_timeout_id); } static int ctnl_timeout_parse_policy(void *timeout, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { struct nlattr **tb; int ret = 0; tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); if (!tb) return -ENOMEM; ret = nla_parse_nested_deprecated(tb, l4proto->ctnl_timeout.nlattr_max, attr, l4proto->ctnl_timeout.nla_policy, NULL); if (ret < 0) goto err; ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout); err: kfree(tb); return ret; } static int cttimeout_new_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); __u16 l3num; __u8 l4num; const struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout *timeout, *matching = NULL; char *name; int ret; if (!cda[CTA_TIMEOUT_NAME] || !cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); list_for_each_entry(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = timeout; break; } if (matching) { if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. */ if (matching->timeout.l3num != l3num || matching->timeout.l4proto->l4proto != l4num) return -EINVAL; return ctnl_timeout_parse_policy(&matching->timeout.data, matching->timeout.l4proto, info->net, cda[CTA_TIMEOUT_DATA]); } return -EBUSY; } l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supportted, skip. */ if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err_proto_put; } timeout = kzalloc(sizeof(struct ctnl_timeout) + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); if (timeout == NULL) { ret = -ENOMEM; goto err_proto_put; } ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); timeout->timeout.l3num = l3num; timeout->timeout.l4proto = l4proto; refcount_set(&timeout->refcnt, 1); __module_get(THIS_MODULE); list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list); return 0; err: kfree(timeout); err_proto_put: return ret; } static int ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct ctnl_timeout *timeout) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->timeout.l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto) || nla_put_be32(skb, CTA_TIMEOUT_USE, htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->timeout.data); if (ret < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nfct_timeout_pernet *pernet; struct net *net = sock_net(skb->sk); struct ctnl_timeout *cur, *last; if (cb->args[2]) return 0; last = (struct ctnl_timeout *)cb->args[1]; if (cb->args[1]) cb->args[1] = 0; rcu_read_lock(); pernet = nfct_timeout_pernet(net); list_for_each_entry_rcu(cur, &pernet->nfct_timeout_list, head) { if (last) { if (cur != last) continue; last = NULL; } if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) { cb->args[1] = (unsigned long)cur; break; } } if (!cb->args[1]) cb->args[2] = 1; rcu_read_unlock(); return skb->len; } static int cttimeout_get_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); int ret = -ENOENT; char *name; struct ctnl_timeout *cur; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnl_timeout_dump, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!cda[CTA_TIMEOUT_NAME]) return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; break; } ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); break; } return ret; } /* try to delete object, fail if it is still in use. */ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) { int ret = 0; /* We want to avoid races with ctnl_timeout_put. So only when the * current refcnt is 1, we decrease it to 0. */ if (refcount_dec_if_one(&timeout->refcnt)) { /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_untimeout(net, &timeout->timeout); kfree_rcu(timeout, rcu_head); } else { ret = -EBUSY; } return ret; } static int cttimeout_del_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; if (!cda[CTA_TIMEOUT_NAME]) { list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) ctnl_timeout_try_del(info->net, cur); return 0; } name = nla_data(cda[CTA_TIMEOUT_NAME]); list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; ret = ctnl_timeout_try_del(info->net, cur); if (ret < 0) return ret; break; } return ret; } static int cttimeout_default_set(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; __u8 l4num; int ret; if (!cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supported, skip. */ if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err; } ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; return 0; err: return ret; } static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, u16 l3num, const struct nf_conntrack_l4proto *l4proto, const unsigned int *timeouts) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); if (ret < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int cttimeout_default_get(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; __u16 l3num; __u8 l4num; int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); if (l4proto->l4proto != l4num) return -EOPNOTSUPP; switch (l4proto->l4proto) { case IPPROTO_ICMP: timeouts = &nf_icmp_pernet(info->net)->timeout; break; case IPPROTO_TCP: timeouts = nf_tcp_pernet(info->net)->timeouts; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(info->net)->timeouts; break; case IPPROTO_ICMPV6: timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; case IPPROTO_SCTP: #ifdef CONFIG_NF_CT_PROTO_SCTP timeouts = nf_sctp_pernet(info->net)->timeouts; #endif break; case IPPROTO_GRE: #ifdef CONFIG_NF_CT_PROTO_GRE timeouts = nf_gre_pernet(info->net)->timeouts; #endif break; case 255: timeouts = &nf_generic_pernet(info->net)->timeout; break; default: WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); break; } if (!timeouts) return -EOPNOTSUPP; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; ret = cttimeout_default_fill_info(info->net, skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, const char *name) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *timeout, *matching = NULL; list_for_each_entry_rcu(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; if (!refcount_inc_not_zero(&timeout->refcnt)) goto err; matching = timeout; break; } err: return matching ? &matching->timeout : NULL; } static void ctnl_timeout_put(struct nf_ct_timeout *t) { struct ctnl_timeout *timeout = container_of(t, struct ctnl_timeout, timeout); if (refcount_dec_and_test(&timeout->refcnt)) { kfree_rcu(timeout, rcu_head); module_put(THIS_MODULE); } } static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = { .call = cttimeout_default_set, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = { .call = cttimeout_default_get, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, }; static const struct nfnetlink_subsystem cttimeout_subsys = { .name = "conntrack_timeout", .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT, .cb_count = IPCTNL_MSG_TIMEOUT_MAX, .cb = cttimeout_cb, }; MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); static int __net_init cttimeout_net_init(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); INIT_LIST_HEAD(&pernet->nfct_timeout_list); INIT_LIST_HEAD(&pernet->nfct_timeout_freelist); return 0; } static void __net_exit cttimeout_net_pre_exit(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) { list_del_rcu(&cur->head); list_add(&cur->free_head, &pernet->nfct_timeout_freelist); } /* core calls synchronize_rcu() after this */ } static void __net_exit cttimeout_net_exit(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; if (list_empty(&pernet->nfct_timeout_freelist)) return; nf_ct_untimeout(net, NULL); list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) { list_del(&cur->free_head); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); } } static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, .pre_exit = cttimeout_net_pre_exit, .exit = cttimeout_net_exit, .id = &nfct_timeout_id, .size = sizeof(struct nfct_timeout_pernet), }; static const struct nf_ct_timeout_hooks hooks = { .timeout_find_get = ctnl_timeout_find_get, .timeout_put = ctnl_timeout_put, }; static int __init cttimeout_init(void) { int ret; ret = register_pernet_subsys(&cttimeout_ops); if (ret < 0) return ret; ret = nfnetlink_subsys_register(&cttimeout_subsys); if (ret < 0) { pr_err("cttimeout_init: cannot register cttimeout with " "nfnetlink.\n"); goto err_out; } RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks); return 0; err_out: unregister_pernet_subsys(&cttimeout_ops); return ret; } static int untimeout(struct nf_conn *ct, void *timeout) { struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) RCU_INIT_POINTER(timeout_ext->timeout, NULL); return 0; } static void __exit cttimeout_exit(void) { nfnetlink_subsys_unregister(&cttimeout_subsys); unregister_pernet_subsys(&cttimeout_ops); RCU_INIT_POINTER(nf_ct_timeout_hook, NULL); nf_ct_iterate_destroy(untimeout, NULL); } module_init(cttimeout_init); module_exit(cttimeout_exit);
2 112 112 7 3 1 2 54 6 1 5 91 1 90 4 27 62 61 15 83 89 89 89 3 7 4 4 7 1 5 1 1 52 1 44 8 6 51 36 9 21 4 2 37 39 38 1 19 3 9 8 75 75 70 5 3 1 73 75 84 1 83 3 1 3 1 2 4 71 3 60 32 74 36 56 163 1 3 2 134 1 1 20 1 10 10 144 1 129 14 55 1 73 203 1 202 113 107 107 2 2 1 1 80 85 73 73 1 1 1 1 1 1 1 1 2 3 2 1 1 1 1 1 36 1 2 1 1 1 1 3 1 1 2 1 114 1 1 25 1 1 38 80 55 519 518 340 134 17 16 17 3 14 5 6 6 5 1 5 4 1 1 2 4 41 157 110 66 4 105 1 104 66 224 155 69 115 109 52 172 214 214 91 214 214 36 50 102 158 8 6 8 6 123 123 66 65 52 21 32 5 40 5 5 14 237 234 2 23 236 57 259 258 88 89 89 7 90 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 // SPDX-License-Identifier: GPL-2.0-or-later /* * History: * Started: Aug 9 by Lawrence Foard (entropy@world.std.com), * to allow user process control of SCSI devices. * Development Sponsored by Killy Corp. NY NY * * Original driver (sg.c): * Copyright (C) 1992 Lawrence Foard * Version 2 and 3 extensions to driver: * Copyright (C) 1998 - 2014 Douglas Gilbert */ static int sg_version_num = 30536; /* 2 digits for each component */ #define SG_VERSION_STR "3.5.36" /* * D. P. Gilbert (dgilbert@interlog.com), notes: * - scsi logging is available via SCSI_LOG_TIMEOUT macros. First * the kernel/module needs to be built with CONFIG_SCSI_LOGGING * (otherwise the macros compile to empty statements). * */ #include <linux/module.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/errno.h> #include <linux/mtio.h> #include <linux/ioctl.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/moduleparam.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/blktrace_api.h> #include <linux/mutex.h> #include <linux/atomic.h> #include <linux/ratelimit.h> #include <linux/uio.h> #include <linux/cred.h> /* for sg_check_file_access() */ #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsi_tcq.h> #include <scsi/sg.h> #include "scsi_logging.h" #ifdef CONFIG_SCSI_PROC_FS #include <linux/proc_fs.h> static char *sg_version_date = "20140603"; static int sg_proc_init(void); #endif #define SG_ALLOW_DIO_DEF 0 #define SG_MAX_DEVS (1 << MINORBITS) /* SG_MAX_CDB_SIZE should be 260 (spc4r37 section 3.1.30) however the type * of sg_io_hdr::cmd_len can only represent 255. All SCSI commands greater * than 16 bytes are "variable length" whose length is a multiple of 4 */ #define SG_MAX_CDB_SIZE 252 #define SG_DEFAULT_TIMEOUT mult_frac(SG_DEFAULT_TIMEOUT_USER, HZ, USER_HZ) static int sg_big_buff = SG_DEF_RESERVED_SIZE; /* N.B. This variable is readable and writeable via /proc/scsi/sg/def_reserved_size . Each time sg_open() is called a buffer of this size (or less if there is not enough memory) will be reserved for use by this file descriptor. [Deprecated usage: this variable is also readable via /proc/sys/kernel/sg-big-buff if the sg driver is built into the kernel (i.e. it is not a module).] */ static int def_reserved_size = -1; /* picks up init parameter */ static int sg_allow_dio = SG_ALLOW_DIO_DEF; static int scatter_elem_sz = SG_SCATTER_SZ; static int scatter_elem_sz_prev = SG_SCATTER_SZ; #define SG_SECTOR_SZ 512 static int sg_add_device(struct device *); static void sg_remove_device(struct device *); static DEFINE_IDR(sg_index_idr); static DEFINE_RWLOCK(sg_index_lock); /* Also used to lock file descriptor list for device */ static struct class_interface sg_interface = { .add_dev = sg_add_device, .remove_dev = sg_remove_device, }; typedef struct sg_scatter_hold { /* holding area for scsi scatter gather info */ unsigned short k_use_sg; /* Count of kernel scatter-gather pieces */ unsigned sglist_len; /* size of malloc'd scatter-gather list ++ */ unsigned bufflen; /* Size of (aggregate) data buffer */ struct page **pages; int page_order; char dio_in_use; /* 0->indirect IO (or mmap), 1->dio */ unsigned char cmd_opcode; /* first byte of command */ } Sg_scatter_hold; struct sg_device; /* forward declarations */ struct sg_fd; typedef struct sg_request { /* SG_MAX_QUEUE requests outstanding per file */ struct list_head entry; /* list entry */ struct sg_fd *parentfp; /* NULL -> not in use */ Sg_scatter_hold data; /* hold buffer, perhaps scatter list */ sg_io_hdr_t header; /* scsi command+info, see <scsi/sg.h> */ unsigned char sense_b[SCSI_SENSE_BUFFERSIZE]; char res_used; /* 1 -> using reserve buffer, 0 -> not ... */ char orphan; /* 1 -> drop on sight, 0 -> normal */ char sg_io_owned; /* 1 -> packet belongs to SG_IO */ /* done protected by rq_list_lock */ char done; /* 0->before bh, 1->before read, 2->read */ struct request *rq; struct bio *bio; struct execute_work ew; } Sg_request; typedef struct sg_fd { /* holds the state of a file descriptor */ struct list_head sfd_siblings; /* protected by device's sfd_lock */ struct sg_device *parentdp; /* owning device */ wait_queue_head_t read_wait; /* queue read until command done */ rwlock_t rq_list_lock; /* protect access to list in req_arr */ struct mutex f_mutex; /* protect against changes in this fd */ int timeout; /* defaults to SG_DEFAULT_TIMEOUT */ int timeout_user; /* defaults to SG_DEFAULT_TIMEOUT_USER */ Sg_scatter_hold reserve; /* buffer held for this file descriptor */ struct list_head rq_list; /* head of request list */ struct fasync_struct *async_qp; /* used by asynchronous notification */ Sg_request req_arr[SG_MAX_QUEUE]; /* used as singly-linked list */ char force_packid; /* 1 -> pack_id input to read(), 0 -> ignored */ char cmd_q; /* 1 -> allow command queuing, 0 -> don't */ unsigned char next_cmd_len; /* 0: automatic, >0: use on next write() */ char keep_orphan; /* 0 -> drop orphan (def), 1 -> keep for read() */ char mmap_called; /* 0 -> mmap() never called on this fd */ char res_in_use; /* 1 -> 'reserve' array in use */ struct kref f_ref; struct execute_work ew; } Sg_fd; typedef struct sg_device { /* holds the state of each scsi generic device */ struct scsi_device *device; wait_queue_head_t open_wait; /* queue open() when O_EXCL present */ struct mutex open_rel_lock; /* held when in open() or release() */ int sg_tablesize; /* adapter's max scatter-gather table size */ u32 index; /* device index number */ struct list_head sfds; rwlock_t sfd_lock; /* protect access to sfd list */ atomic_t detaching; /* 0->device usable, 1->device detaching */ bool exclude; /* 1->open(O_EXCL) succeeded and is active */ int open_cnt; /* count of opens (perhaps < num(sfds) ) */ char sgdebug; /* 0->off, 1->sense, 9->dump dev, 10-> all devs */ char name[DISK_NAME_LEN]; struct cdev * cdev; /* char_dev [sysfs: /sys/cdev/major/sg<n>] */ struct kref d_ref; } Sg_device; /* tasklet or soft irq callback */ static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status); static int sg_start_req(Sg_request *srp, unsigned char *cmd); static int sg_finish_rem_req(Sg_request * srp); static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size); static ssize_t sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp); static ssize_t sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, size_t count, int blocking, int read_only, int sg_io_owned, Sg_request **o_srp); static int sg_common_write(Sg_fd * sfp, Sg_request * srp, unsigned char *cmnd, int timeout, int blocking); static int sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer); static void sg_remove_scat(Sg_fd * sfp, Sg_scatter_hold * schp); static void sg_build_reserve(Sg_fd * sfp, int req_size); static void sg_link_reserve(Sg_fd * sfp, Sg_request * srp, int size); static void sg_unlink_reserve(Sg_fd * sfp, Sg_request * srp); static Sg_fd *sg_add_sfp(Sg_device * sdp); static void sg_remove_sfp(struct kref *); static Sg_request *sg_get_rq_mark(Sg_fd * sfp, int pack_id, bool *busy); static Sg_request *sg_add_request(Sg_fd * sfp); static int sg_remove_request(Sg_fd * sfp, Sg_request * srp); static Sg_device *sg_get_dev(int dev); static void sg_device_destroy(struct kref *kref); #define SZ_SG_HEADER sizeof(struct sg_header) #define SZ_SG_IO_HDR sizeof(sg_io_hdr_t) #define SZ_SG_IOVEC sizeof(sg_iovec_t) #define SZ_SG_REQ_INFO sizeof(sg_req_info_t) #define sg_printk(prefix, sdp, fmt, a...) \ sdev_prefix_printk(prefix, (sdp)->device, (sdp)->name, fmt, ##a) /* * The SCSI interfaces that use read() and write() as an asynchronous variant of * ioctl(..., SG_IO, ...) are fundamentally unsafe, since there are lots of ways * to trigger read() and write() calls from various contexts with elevated * privileges. This can lead to kernel memory corruption (e.g. if these * interfaces are called through splice()) and privilege escalation inside * userspace (e.g. if a process with access to such a device passes a file * descriptor to a SUID binary as stdin/stdout/stderr). * * This function provides protection for the legacy API by restricting the * calling context. */ static int sg_check_file_access(struct file *filp, const char *caller) { if (filp->f_cred != current_real_cred()) { pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", caller, task_tgid_vnr(current), current->comm); return -EPERM; } return 0; } static int sg_allow_access(struct file *filp, unsigned char *cmd) { struct sg_fd *sfp = filp->private_data; if (sfp->parentdp->device->type == TYPE_SCANNER) return 0; if (!scsi_cmd_allowed(cmd, filp->f_mode & FMODE_WRITE)) return -EPERM; return 0; } static int open_wait(Sg_device *sdp, int flags) { int retval = 0; if (flags & O_EXCL) { while (sdp->open_cnt > 0) { mutex_unlock(&sdp->open_rel_lock); retval = wait_event_interruptible(sdp->open_wait, (atomic_read(&sdp->detaching) || !sdp->open_cnt)); mutex_lock(&sdp->open_rel_lock); if (retval) /* -ERESTARTSYS */ return retval; if (atomic_read(&sdp->detaching)) return -ENODEV; } } else { while (sdp->exclude) { mutex_unlock(&sdp->open_rel_lock); retval = wait_event_interruptible(sdp->open_wait, (atomic_read(&sdp->detaching) || !sdp->exclude)); mutex_lock(&sdp->open_rel_lock); if (retval) /* -ERESTARTSYS */ return retval; if (atomic_read(&sdp->detaching)) return -ENODEV; } } return retval; } /* Returns 0 on success, else a negated errno value */ static int sg_open(struct inode *inode, struct file *filp) { int dev = iminor(inode); int flags = filp->f_flags; struct request_queue *q; struct scsi_device *device; Sg_device *sdp; Sg_fd *sfp; int retval; nonseekable_open(inode, filp); if ((flags & O_EXCL) && (O_RDONLY == (flags & O_ACCMODE))) return -EPERM; /* Can't lock it with read only access */ sdp = sg_get_dev(dev); if (IS_ERR(sdp)) return PTR_ERR(sdp); SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_open: flags=0x%x\n", flags)); /* This driver's module count bumped by fops_get in <linux/fs.h> */ /* Prevent the device driver from vanishing while we sleep */ device = sdp->device; retval = scsi_device_get(device); if (retval) goto sg_put; /* scsi_block_when_processing_errors() may block so bypass * check if O_NONBLOCK. Permits SCSI commands to be issued * during error recovery. Tread carefully. */ if (!((flags & O_NONBLOCK) || scsi_block_when_processing_errors(device))) { retval = -ENXIO; /* we are in error recovery for this device */ goto sdp_put; } mutex_lock(&sdp->open_rel_lock); if (flags & O_NONBLOCK) { if (flags & O_EXCL) { if (sdp->open_cnt > 0) { retval = -EBUSY; goto error_mutex_locked; } } else { if (sdp->exclude) { retval = -EBUSY; goto error_mutex_locked; } } } else { retval = open_wait(sdp, flags); if (retval) /* -ERESTARTSYS or -ENODEV */ goto error_mutex_locked; } /* N.B. at this point we are holding the open_rel_lock */ if (flags & O_EXCL) sdp->exclude = true; if (sdp->open_cnt < 1) { /* no existing opens */ sdp->sgdebug = 0; q = device->request_queue; sdp->sg_tablesize = queue_max_segments(q); } sfp = sg_add_sfp(sdp); if (IS_ERR(sfp)) { retval = PTR_ERR(sfp); goto out_undo; } filp->private_data = sfp; sdp->open_cnt++; mutex_unlock(&sdp->open_rel_lock); retval = 0; sg_put: kref_put(&sdp->d_ref, sg_device_destroy); return retval; out_undo: if (flags & O_EXCL) { sdp->exclude = false; /* undo if error */ wake_up_interruptible(&sdp->open_wait); } error_mutex_locked: mutex_unlock(&sdp->open_rel_lock); sdp_put: kref_put(&sdp->d_ref, sg_device_destroy); scsi_device_put(device); return retval; } /* Release resources associated with a successful sg_open() * Returns 0 on success, else a negated errno value */ static int sg_release(struct inode *inode, struct file *filp) { Sg_device *sdp; Sg_fd *sfp; if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp))) return -ENXIO; SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_release\n")); mutex_lock(&sdp->open_rel_lock); sdp->open_cnt--; /* possibly many open()s waiting on exlude clearing, start many; * only open(O_EXCL)s wait on 0==open_cnt so only start one */ if (sdp->exclude) { sdp->exclude = false; wake_up_interruptible_all(&sdp->open_wait); } else if (0 == sdp->open_cnt) { wake_up_interruptible(&sdp->open_wait); } mutex_unlock(&sdp->open_rel_lock); kref_put(&sfp->f_ref, sg_remove_sfp); return 0; } static int get_sg_io_pack_id(int *pack_id, void __user *buf, size_t count) { struct sg_header __user *old_hdr = buf; int reply_len; if (count >= SZ_SG_HEADER) { /* negative reply_len means v3 format, otherwise v1/v2 */ if (get_user(reply_len, &old_hdr->reply_len)) return -EFAULT; if (reply_len >= 0) return get_user(*pack_id, &old_hdr->pack_id); if (in_compat_syscall() && count >= sizeof(struct compat_sg_io_hdr)) { struct compat_sg_io_hdr __user *hp = buf; return get_user(*pack_id, &hp->pack_id); } if (count >= sizeof(struct sg_io_hdr)) { struct sg_io_hdr __user *hp = buf; return get_user(*pack_id, &hp->pack_id); } } /* no valid header was passed, so ignore the pack_id */ *pack_id = -1; return 0; } static ssize_t sg_read(struct file *filp, char __user *buf, size_t count, loff_t * ppos) { Sg_device *sdp; Sg_fd *sfp; Sg_request *srp; int req_pack_id = -1; bool busy; sg_io_hdr_t *hp; struct sg_header *old_hdr; int retval; /* * This could cause a response to be stranded. Close the associated * file descriptor to free up any resources being held. */ retval = sg_check_file_access(filp, __func__); if (retval) return retval; if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp))) return -ENXIO; SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_read: count=%d\n", (int) count)); if (sfp->force_packid) retval = get_sg_io_pack_id(&req_pack_id, buf, count); if (retval) return retval; srp = sg_get_rq_mark(sfp, req_pack_id, &busy); if (!srp) { /* now wait on packet to arrive */ if (filp->f_flags & O_NONBLOCK) return -EAGAIN; retval = wait_event_interruptible(sfp->read_wait, ((srp = sg_get_rq_mark(sfp, req_pack_id, &busy)) || (!busy && atomic_read(&sdp->detaching)))); if (!srp) /* signal or detaching */ return retval ? retval : -ENODEV; } if (srp->header.interface_id != '\0') return sg_new_read(sfp, buf, count, srp); hp = &srp->header; old_hdr = kzalloc(SZ_SG_HEADER, GFP_KERNEL); if (!old_hdr) return -ENOMEM; old_hdr->reply_len = (int) hp->timeout; old_hdr->pack_len = old_hdr->reply_len; /* old, strange behaviour */ old_hdr->pack_id = hp->pack_id; old_hdr->twelve_byte = ((srp->data.cmd_opcode >= 0xc0) && (12 == hp->cmd_len)) ? 1 : 0; old_hdr->target_status = hp->masked_status; old_hdr->host_status = hp->host_status; old_hdr->driver_status = hp->driver_status; if ((CHECK_CONDITION & hp->masked_status) || (srp->sense_b[0] & 0x70) == 0x70) { old_hdr->driver_status = DRIVER_SENSE; memcpy(old_hdr->sense_buffer, srp->sense_b, sizeof (old_hdr->sense_buffer)); } switch (hp->host_status) { /* This setup of 'result' is for backward compatibility and is best ignored by the user who should use target, host + driver status */ case DID_OK: case DID_PASSTHROUGH: case DID_SOFT_ERROR: old_hdr->result = 0; break; case DID_NO_CONNECT: case DID_BUS_BUSY: case DID_TIME_OUT: old_hdr->result = EBUSY; break; case DID_BAD_TARGET: case DID_ABORT: case DID_PARITY: case DID_RESET: case DID_BAD_INTR: old_hdr->result = EIO; break; case DID_ERROR: old_hdr->result = (srp->sense_b[0] == 0 && hp->masked_status == GOOD) ? 0 : EIO; break; default: old_hdr->result = EIO; break; } /* Now copy the result back to the user buffer. */ if (count >= SZ_SG_HEADER) { if (copy_to_user(buf, old_hdr, SZ_SG_HEADER)) { retval = -EFAULT; goto free_old_hdr; } buf += SZ_SG_HEADER; if (count > old_hdr->reply_len) count = old_hdr->reply_len; if (count > SZ_SG_HEADER) { if (sg_read_oxfer(srp, buf, count - SZ_SG_HEADER)) { retval = -EFAULT; goto free_old_hdr; } } } else count = (old_hdr->result == 0) ? 0 : -EIO; sg_finish_rem_req(srp); sg_remove_request(sfp, srp); retval = count; free_old_hdr: kfree(old_hdr); return retval; } static ssize_t sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp) { sg_io_hdr_t *hp = &srp->header; int err = 0, err2; int len; if (in_compat_syscall()) { if (count < sizeof(struct compat_sg_io_hdr)) { err = -EINVAL; goto err_out; } } else if (count < SZ_SG_IO_HDR) { err = -EINVAL; goto err_out; } hp->sb_len_wr = 0; if ((hp->mx_sb_len > 0) && hp->sbp) { if ((CHECK_CONDITION & hp->masked_status) || (srp->sense_b[0] & 0x70) == 0x70) { int sb_len = SCSI_SENSE_BUFFERSIZE; sb_len = (hp->mx_sb_len > sb_len) ? sb_len : hp->mx_sb_len; len = 8 + (int) srp->sense_b[7]; /* Additional sense length field */ len = (len > sb_len) ? sb_len : len; if (copy_to_user(hp->sbp, srp->sense_b, len)) { err = -EFAULT; goto err_out; } hp->driver_status = DRIVER_SENSE; hp->sb_len_wr = len; } } if (hp->masked_status || hp->host_status || hp->driver_status) hp->info |= SG_INFO_CHECK; err = put_sg_io_hdr(hp, buf); err_out: err2 = sg_finish_rem_req(srp); sg_remove_request(sfp, srp); return err ? : err2 ? : count; } static ssize_t sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) { int mxsize, cmd_size, k; int input_size, blocking; unsigned char opcode; Sg_device *sdp; Sg_fd *sfp; Sg_request *srp; struct sg_header old_hdr; sg_io_hdr_t *hp; unsigned char cmnd[SG_MAX_CDB_SIZE]; int retval; retval = sg_check_file_access(filp, __func__); if (retval) return retval; if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp))) return -ENXIO; SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_write: count=%d\n", (int) count)); if (atomic_read(&sdp->detaching)) return -ENODEV; if (!((filp->f_flags & O_NONBLOCK) || scsi_block_when_processing_errors(sdp->device))) return -ENXIO; if (count < SZ_SG_HEADER) return -EIO; if (copy_from_user(&old_hdr, buf, SZ_SG_HEADER)) return -EFAULT; blocking = !(filp->f_flags & O_NONBLOCK); if (old_hdr.reply_len < 0) return sg_new_write(sfp, filp, buf, count, blocking, 0, 0, NULL); if (count < (SZ_SG_HEADER + 6)) return -EIO; /* The minimum scsi command length is 6 bytes. */ buf += SZ_SG_HEADER; if (get_user(opcode, buf)) return -EFAULT; if (!(srp = sg_add_request(sfp))) { SCSI_LOG_TIMEOUT(1, sg_printk(KERN_INFO, sdp, "sg_write: queue full\n")); return -EDOM; } mutex_lock(&sfp->f_mutex); if (sfp->next_cmd_len > 0) { cmd_size = sfp->next_cmd_len; sfp->next_cmd_len = 0; /* reset so only this write() effected */ } else { cmd_size = COMMAND_SIZE(opcode); /* based on SCSI command group */ if ((opcode >= 0xc0) && old_hdr.twelve_byte) cmd_size = 12; } mutex_unlock(&sfp->f_mutex); SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sdp, "sg_write: scsi opcode=0x%02x, cmd_size=%d\n", (int) opcode, cmd_size)); /* Determine buffer size. */ input_size = count - cmd_size; mxsize = (input_size > old_hdr.reply_len) ? input_size : old_hdr.reply_len; mxsize -= SZ_SG_HEADER; input_size -= SZ_SG_HEADER; if (input_size < 0) { sg_remove_request(sfp, srp); return -EIO; /* User did not pass enough bytes for this command. */ } hp = &srp->header; hp->interface_id = '\0'; /* indicator of old interface tunnelled */ hp->cmd_len = (unsigned char) cmd_size; hp->iovec_count = 0; hp->mx_sb_len = 0; if (input_size > 0) hp->dxfer_direction = (old_hdr.reply_len > SZ_SG_HEADER) ? SG_DXFER_TO_FROM_DEV : SG_DXFER_TO_DEV; else hp->dxfer_direction = (mxsize > 0) ? SG_DXFER_FROM_DEV : SG_DXFER_NONE; hp->dxfer_len = mxsize; if ((hp->dxfer_direction == SG_DXFER_TO_DEV) || (hp->dxfer_direction == SG_DXFER_TO_FROM_DEV)) hp->dxferp = (char __user *)buf + cmd_size; else hp->dxferp = NULL; hp->sbp = NULL; hp->timeout = old_hdr.reply_len; /* structure abuse ... */ hp->flags = input_size; /* structure abuse ... */ hp->pack_id = old_hdr.pack_id; hp->usr_ptr = NULL; if (copy_from_user(cmnd, buf, cmd_size)) { sg_remove_request(sfp, srp); return -EFAULT; } /* * SG_DXFER_TO_FROM_DEV is functionally equivalent to SG_DXFER_FROM_DEV, * but is is possible that the app intended SG_DXFER_TO_DEV, because there * is a non-zero input_size, so emit a warning. */ if (hp->dxfer_direction == SG_DXFER_TO_FROM_DEV) { printk_ratelimited(KERN_WARNING "sg_write: data in/out %d/%d bytes " "for SCSI command 0x%x-- guessing " "data in;\n program %s not setting " "count and/or reply_len properly\n", old_hdr.reply_len - (int)SZ_SG_HEADER, input_size, (unsigned int) cmnd[0], current->comm); } k = sg_common_write(sfp, srp, cmnd, sfp->timeout, blocking); return (k < 0) ? k : count; } static ssize_t sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, size_t count, int blocking, int read_only, int sg_io_owned, Sg_request **o_srp) { int k; Sg_request *srp; sg_io_hdr_t *hp; unsigned char cmnd[SG_MAX_CDB_SIZE]; int timeout; unsigned long ul_timeout; if (count < SZ_SG_IO_HDR) return -EINVAL; sfp->cmd_q = 1; /* when sg_io_hdr seen, set command queuing on */ if (!(srp = sg_add_request(sfp))) { SCSI_LOG_TIMEOUT(1, sg_printk(KERN_INFO, sfp->parentdp, "sg_new_write: queue full\n")); return -EDOM; } srp->sg_io_owned = sg_io_owned; hp = &srp->header; if (get_sg_io_hdr(hp, buf)) { sg_remove_request(sfp, srp); return -EFAULT; } if (hp->interface_id != 'S') { sg_remove_request(sfp, srp); return -ENOSYS; } if (hp->flags & SG_FLAG_MMAP_IO) { if (hp->dxfer_len > sfp->reserve.bufflen) { sg_remove_request(sfp, srp); return -ENOMEM; /* MMAP_IO size must fit in reserve buffer */ } if (hp->flags & SG_FLAG_DIRECT_IO) { sg_remove_request(sfp, srp); return -EINVAL; /* either MMAP_IO or DIRECT_IO (not both) */ } if (sfp->res_in_use) { sg_remove_request(sfp, srp); return -EBUSY; /* reserve buffer already being used */ } } ul_timeout = msecs_to_jiffies(srp->header.timeout); timeout = (ul_timeout < INT_MAX) ? ul_timeout : INT_MAX; if ((!hp->cmdp) || (hp->cmd_len < 6) || (hp->cmd_len > sizeof (cmnd))) { sg_remove_request(sfp, srp); return -EMSGSIZE; } if (copy_from_user(cmnd, hp->cmdp, hp->cmd_len)) { sg_remove_request(sfp, srp); return -EFAULT; } if (read_only && sg_allow_access(file, cmnd)) { sg_remove_request(sfp, srp); return -EPERM; } k = sg_common_write(sfp, srp, cmnd, timeout, blocking); if (k < 0) return k; if (o_srp) *o_srp = srp; return count; } static int sg_common_write(Sg_fd * sfp, Sg_request * srp, unsigned char *cmnd, int timeout, int blocking) { int k, at_head; Sg_device *sdp = sfp->parentdp; sg_io_hdr_t *hp = &srp->header; srp->data.cmd_opcode = cmnd[0]; /* hold opcode of command */ hp->status = 0; hp->masked_status = 0; hp->msg_status = 0; hp->info = 0; hp->host_status = 0; hp->driver_status = 0; hp->resid = 0; SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sfp->parentdp, "sg_common_write: scsi opcode=0x%02x, cmd_size=%d\n", (int) cmnd[0], (int) hp->cmd_len)); if (hp->dxfer_len >= SZ_256M) { sg_remove_request(sfp, srp); return -EINVAL; } k = sg_start_req(srp, cmnd); if (k) { SCSI_LOG_TIMEOUT(1, sg_printk(KERN_INFO, sfp->parentdp, "sg_common_write: start_req err=%d\n", k)); sg_finish_rem_req(srp); sg_remove_request(sfp, srp); return k; /* probably out of space --> ENOMEM */ } if (atomic_read(&sdp->detaching)) { if (srp->bio) { blk_mq_free_request(srp->rq); srp->rq = NULL; } sg_finish_rem_req(srp); sg_remove_request(sfp, srp); return -ENODEV; } hp->duration = jiffies_to_msecs(jiffies); if (hp->interface_id != '\0' && /* v3 (or later) interface */ (SG_FLAG_Q_AT_TAIL & hp->flags)) at_head = 0; else at_head = 1; srp->rq->timeout = timeout; kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */ srp->rq->end_io = sg_rq_end_io; blk_execute_rq_nowait(srp->rq, at_head); return 0; } static int srp_done(Sg_fd *sfp, Sg_request *srp) { unsigned long flags; int ret; read_lock_irqsave(&sfp->rq_list_lock, flags); ret = srp->done; read_unlock_irqrestore(&sfp->rq_list_lock, flags); return ret; } static int max_sectors_bytes(struct request_queue *q) { unsigned int max_sectors = queue_max_sectors(q); max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); return max_sectors << 9; } static void sg_fill_request_table(Sg_fd *sfp, sg_req_info_t *rinfo) { Sg_request *srp; int val; unsigned int ms; val = 0; list_for_each_entry(srp, &sfp->rq_list, entry) { if (val >= SG_MAX_QUEUE) break; rinfo[val].req_state = srp->done + 1; rinfo[val].problem = srp->header.masked_status & srp->header.host_status & srp->header.driver_status; if (srp->done) rinfo[val].duration = srp->header.duration; else { ms = jiffies_to_msecs(jiffies); rinfo[val].duration = (ms > srp->header.duration) ? (ms - srp->header.duration) : 0; } rinfo[val].orphan = srp->orphan; rinfo[val].sg_io_owned = srp->sg_io_owned; rinfo[val].pack_id = srp->header.pack_id; rinfo[val].usr_ptr = srp->header.usr_ptr; val++; } } #ifdef CONFIG_COMPAT struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ char req_state; char orphan; char sg_io_owned; char problem; int pack_id; compat_uptr_t usr_ptr; unsigned int duration; int unused; }; static int put_compat_request_table(struct compat_sg_req_info __user *o, struct sg_req_info *rinfo) { int i; for (i = 0; i < SG_MAX_QUEUE; i++) { if (copy_to_user(o + i, rinfo + i, offsetof(sg_req_info_t, usr_ptr)) || put_user((uintptr_t)rinfo[i].usr_ptr, &o[i].usr_ptr) || put_user(rinfo[i].duration, &o[i].duration) || put_user(rinfo[i].unused, &o[i].unused)) return -EFAULT; } return 0; } #endif static long sg_ioctl_common(struct file *filp, Sg_device *sdp, Sg_fd *sfp, unsigned int cmd_in, void __user *p) { int __user *ip = p; int result, val, read_only; Sg_request *srp; unsigned long iflags; SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_ioctl: cmd=0x%x\n", (int) cmd_in)); read_only = (O_RDWR != (filp->f_flags & O_ACCMODE)); switch (cmd_in) { case SG_IO: if (atomic_read(&sdp->detaching)) return -ENODEV; if (!scsi_block_when_processing_errors(sdp->device)) return -ENXIO; result = sg_new_write(sfp, filp, p, SZ_SG_IO_HDR, 1, read_only, 1, &srp); if (result < 0) return result; result = wait_event_interruptible(sfp->read_wait, srp_done(sfp, srp)); write_lock_irq(&sfp->rq_list_lock); if (srp->done) { srp->done = 2; write_unlock_irq(&sfp->rq_list_lock); result = sg_new_read(sfp, p, SZ_SG_IO_HDR, srp); return (result < 0) ? result : 0; } srp->orphan = 1; write_unlock_irq(&sfp->rq_list_lock); return result; /* -ERESTARTSYS because signal hit process */ case SG_SET_TIMEOUT: result = get_user(val, ip); if (result) return result; if (val < 0) return -EIO; if (val >= mult_frac((s64)INT_MAX, USER_HZ, HZ)) val = min_t(s64, mult_frac((s64)INT_MAX, USER_HZ, HZ), INT_MAX); sfp->timeout_user = val; sfp->timeout = mult_frac(val, HZ, USER_HZ); return 0; case SG_GET_TIMEOUT: /* N.B. User receives timeout as return value */ /* strange ..., for backward compatibility */ return sfp->timeout_user; case SG_SET_FORCE_LOW_DMA: /* * N.B. This ioctl never worked properly, but failed to * return an error value. So returning '0' to keep compability * with legacy applications. */ return 0; case SG_GET_LOW_DMA: return put_user(0, ip); case SG_GET_SCSI_ID: { sg_scsi_id_t v; if (atomic_read(&sdp->detaching)) return -ENODEV; memset(&v, 0, sizeof(v)); v.host_no = sdp->device->host->host_no; v.channel = sdp->device->channel; v.scsi_id = sdp->device->id; v.lun = sdp->device->lun; v.scsi_type = sdp->device->type; v.h_cmd_per_lun = sdp->device->host->cmd_per_lun; v.d_queue_depth = sdp->device->queue_depth; if (copy_to_user(p, &v, sizeof(sg_scsi_id_t))) return -EFAULT; return 0; } case SG_SET_FORCE_PACK_ID: result = get_user(val, ip); if (result) return result; sfp->force_packid = val ? 1 : 0; return 0; case SG_GET_PACK_ID: read_lock_irqsave(&sfp->rq_list_lock, iflags); list_for_each_entry(srp, &sfp->rq_list, entry) { if ((1 == srp->done) && (!srp->sg_io_owned)) { read_unlock_irqrestore(&sfp->rq_list_lock, iflags); return put_user(srp->header.pack_id, ip); } } read_unlock_irqrestore(&sfp->rq_list_lock, iflags); return put_user(-1, ip); case SG_GET_NUM_WAITING: read_lock_irqsave(&sfp->rq_list_lock, iflags); val = 0; list_for_each_entry(srp, &sfp->rq_list, entry) { if ((1 == srp->done) && (!srp->sg_io_owned)) ++val; } read_unlock_irqrestore(&sfp->rq_list_lock, iflags); return put_user(val, ip); case SG_GET_SG_TABLESIZE: return put_user(sdp->sg_tablesize, ip); case SG_SET_RESERVED_SIZE: result = get_user(val, ip); if (result) return result; if (val < 0) return -EINVAL; val = min_t(int, val, max_sectors_bytes(sdp->device->request_queue)); mutex_lock(&sfp->f_mutex); if (val != sfp->reserve.bufflen) { if (sfp->mmap_called || sfp->res_in_use) { mutex_unlock(&sfp->f_mutex); return -EBUSY; } sg_remove_scat(sfp, &sfp->reserve); sg_build_reserve(sfp, val); } mutex_unlock(&sfp->f_mutex); return 0; case SG_GET_RESERVED_SIZE: val = min_t(int, sfp->reserve.bufflen, max_sectors_bytes(sdp->device->request_queue)); return put_user(val, ip); case SG_SET_COMMAND_Q: result = get_user(val, ip); if (result) return result; sfp->cmd_q = val ? 1 : 0; return 0; case SG_GET_COMMAND_Q: return put_user((int) sfp->cmd_q, ip); case SG_SET_KEEP_ORPHAN: result = get_user(val, ip); if (result) return result; sfp->keep_orphan = val; return 0; case SG_GET_KEEP_ORPHAN: return put_user((int) sfp->keep_orphan, ip); case SG_NEXT_CMD_LEN: result = get_user(val, ip); if (result) return result; if (val > SG_MAX_CDB_SIZE) return -ENOMEM; sfp->next_cmd_len = (val > 0) ? val : 0; return 0; case SG_GET_VERSION_NUM: return put_user(sg_version_num, ip); case SG_GET_ACCESS_COUNT: /* faked - we don't have a real access count anymore */ val = (sdp->device ? 1 : 0); return put_user(val, ip); case SG_GET_REQUEST_TABLE: { sg_req_info_t *rinfo; rinfo = kcalloc(SG_MAX_QUEUE, SZ_SG_REQ_INFO, GFP_KERNEL); if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); sg_fill_request_table(sfp, rinfo); read_unlock_irqrestore(&sfp->rq_list_lock, iflags); #ifdef CONFIG_COMPAT if (in_compat_syscall()) result = put_compat_request_table(p, rinfo); else #endif result = copy_to_user(p, rinfo, SZ_SG_REQ_INFO * SG_MAX_QUEUE); result = result ? -EFAULT : 0; kfree(rinfo); return result; } case SG_EMULATED_HOST: if (atomic_read(&sdp->detaching)) return -ENODEV; return put_user(sdp->device->host->hostt->emulated, ip); case SCSI_IOCTL_SEND_COMMAND: if (atomic_read(&sdp->detaching)) return -ENODEV; return scsi_ioctl(sdp->device, filp->f_mode & FMODE_WRITE, cmd_in, p); case SG_SET_DEBUG: result = get_user(val, ip); if (result) return result; sdp->sgdebug = (char) val; return 0; case BLKSECTGET: return put_user(max_sectors_bytes(sdp->device->request_queue), ip); case BLKTRACESETUP: return blk_trace_setup(sdp->device->request_queue, sdp->name, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), NULL, p); case BLKTRACESTART: return blk_trace_startstop(sdp->device->request_queue, 1); case BLKTRACESTOP: return blk_trace_startstop(sdp->device->request_queue, 0); case BLKTRACETEARDOWN: return blk_trace_remove(sdp->device->request_queue); case SCSI_IOCTL_GET_IDLUN: case SCSI_IOCTL_GET_BUS_NUMBER: case SCSI_IOCTL_PROBE_HOST: case SG_GET_TRANSFORM: case SG_SCSI_RESET: if (atomic_read(&sdp->detaching)) return -ENODEV; break; default: if (read_only) return -EPERM; /* don't know so take safe approach */ break; } result = scsi_ioctl_block_when_processing_errors(sdp->device, cmd_in, filp->f_flags & O_NDELAY); if (result) return result; return -ENOIOCTLCMD; } static long sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) { void __user *p = (void __user *)arg; Sg_device *sdp; Sg_fd *sfp; int ret; if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp))) return -ENXIO; ret = sg_ioctl_common(filp, sdp, sfp, cmd_in, p); if (ret != -ENOIOCTLCMD) return ret; return scsi_ioctl(sdp->device, filp->f_mode & FMODE_WRITE, cmd_in, p); } static __poll_t sg_poll(struct file *filp, poll_table * wait) { __poll_t res = 0; Sg_device *sdp; Sg_fd *sfp; Sg_request *srp; int count = 0; unsigned long iflags; sfp = filp->private_data; if (!sfp) return EPOLLERR; sdp = sfp->parentdp; if (!sdp) return EPOLLERR; poll_wait(filp, &sfp->read_wait, wait); read_lock_irqsave(&sfp->rq_list_lock, iflags); list_for_each_entry(srp, &sfp->rq_list, entry) { /* if any read waiting, flag it */ if ((0 == res) && (1 == srp->done) && (!srp->sg_io_owned)) res = EPOLLIN | EPOLLRDNORM; ++count; } read_unlock_irqrestore(&sfp->rq_list_lock, iflags); if (atomic_read(&sdp->detaching)) res |= EPOLLHUP; else if (!sfp->cmd_q) { if (0 == count) res |= EPOLLOUT | EPOLLWRNORM; } else if (count < SG_MAX_QUEUE) res |= EPOLLOUT | EPOLLWRNORM; SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_poll: res=0x%x\n", (__force u32) res)); return res; } static int sg_fasync(int fd, struct file *filp, int mode) { Sg_device *sdp; Sg_fd *sfp; if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp))) return -ENXIO; SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_fasync: mode=%d\n", mode)); return fasync_helper(fd, filp, mode, &sfp->async_qp); } static vm_fault_t sg_vma_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; Sg_fd *sfp; unsigned long offset, len, sa; Sg_scatter_hold *rsv_schp; int k, length; if ((NULL == vma) || (!(sfp = (Sg_fd *) vma->vm_private_data))) return VM_FAULT_SIGBUS; rsv_schp = &sfp->reserve; offset = vmf->pgoff << PAGE_SHIFT; if (offset >= rsv_schp->bufflen) return VM_FAULT_SIGBUS; SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sfp->parentdp, "sg_vma_fault: offset=%lu, scatg=%d\n", offset, rsv_schp->k_use_sg)); sa = vma->vm_start; length = 1 << (PAGE_SHIFT + rsv_schp->page_order); for (k = 0; k < rsv_schp->k_use_sg && sa < vma->vm_end; k++) { len = vma->vm_end - sa; len = (len < length) ? len : length; if (offset < len) { struct page *page = nth_page(rsv_schp->pages[k], offset >> PAGE_SHIFT); get_page(page); /* increment page count */ vmf->page = page; return 0; /* success */ } sa += len; offset -= len; } return VM_FAULT_SIGBUS; } static const struct vm_operations_struct sg_mmap_vm_ops = { .fault = sg_vma_fault, }; static int sg_mmap(struct file *filp, struct vm_area_struct *vma) { Sg_fd *sfp; unsigned long req_sz, len, sa; Sg_scatter_hold *rsv_schp; int k, length; int ret = 0; if ((!filp) || (!vma) || (!(sfp = (Sg_fd *) filp->private_data))) return -ENXIO; req_sz = vma->vm_end - vma->vm_start; SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sfp->parentdp, "sg_mmap starting, vm_start=%p, len=%d\n", (void *) vma->vm_start, (int) req_sz)); if (vma->vm_pgoff) return -EINVAL; /* want no offset */ rsv_schp = &sfp->reserve; mutex_lock(&sfp->f_mutex); if (req_sz > rsv_schp->bufflen) { ret = -ENOMEM; /* cannot map more than reserved buffer */ goto out; } sa = vma->vm_start; length = 1 << (PAGE_SHIFT + rsv_schp->page_order); for (k = 0; k < rsv_schp->k_use_sg && sa < vma->vm_end; k++) { len = vma->vm_end - sa; len = (len < length) ? len : length; sa += len; } sfp->mmap_called = 1; vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; out: mutex_unlock(&sfp->f_mutex); return ret; } static void sg_rq_end_io_usercontext(struct work_struct *work) { struct sg_request *srp = container_of(work, struct sg_request, ew.work); struct sg_fd *sfp = srp->parentfp; sg_finish_rem_req(srp); sg_remove_request(sfp, srp); kref_put(&sfp->f_ref, sg_remove_sfp); } /* * This function is a "bottom half" handler that is called by the mid * level when a command is completed (or has failed). */ static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); struct sg_request *srp = rq->end_io_data; Sg_device *sdp; Sg_fd *sfp; unsigned long iflags; unsigned int ms; char *sense; int result, resid, done = 1; if (WARN_ON(srp->done != 0)) return RQ_END_IO_NONE; sfp = srp->parentfp; if (WARN_ON(sfp == NULL)) return RQ_END_IO_NONE; sdp = sfp->parentdp; if (unlikely(atomic_read(&sdp->detaching))) pr_info("%s: device detaching\n", __func__); sense = scmd->sense_buffer; result = scmd->result; resid = scmd->resid_len; SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sdp, "sg_cmd_done: pack_id=%d, res=0x%x\n", srp->header.pack_id, result)); srp->header.resid = resid; ms = jiffies_to_msecs(jiffies); srp->header.duration = (ms > srp->header.duration) ? (ms - srp->header.duration) : 0; if (0 != result) { struct scsi_sense_hdr sshdr; srp->header.status = 0xff & result; srp->header.masked_status = sg_status_byte(result); srp->header.msg_status = COMMAND_COMPLETE; srp->header.host_status = host_byte(result); srp->header.driver_status = driver_byte(result); if ((sdp->sgdebug > 0) && ((CHECK_CONDITION == srp->header.masked_status) || (COMMAND_TERMINATED == srp->header.masked_status))) __scsi_print_sense(sdp->device, __func__, sense, SCSI_SENSE_BUFFERSIZE); /* Following if statement is a patch supplied by Eric Youngdale */ if (driver_byte(result) != 0 && scsi_normalize_sense(sense, SCSI_SENSE_BUFFERSIZE, &sshdr) && !scsi_sense_is_deferred(&sshdr) && sshdr.sense_key == UNIT_ATTENTION && sdp->device->removable) { /* Detected possible disc change. Set the bit - this */ /* may be used if there are filesystems using this device */ sdp->device->changed = 1; } } if (scmd->sense_len) memcpy(srp->sense_b, scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE); /* Rely on write phase to clean out srp status values, so no "else" */ /* * Free the request as soon as it is complete so that its resources * can be reused without waiting for userspace to read() the * result. But keep the associated bio (if any) around until * blk_rq_unmap_user() can be called from user context. */ srp->rq = NULL; blk_mq_free_request(rq); write_lock_irqsave(&sfp->rq_list_lock, iflags); if (unlikely(srp->orphan)) { if (sfp->keep_orphan) srp->sg_io_owned = 0; else done = 0; } srp->done = done; write_unlock_irqrestore(&sfp->rq_list_lock, iflags); if (likely(done)) { /* Now wake up any sg_read() that is waiting for this * packet. */ wake_up_interruptible(&sfp->read_wait); kill_fasync(&sfp->async_qp, SIGPOLL, POLL_IN); kref_put(&sfp->f_ref, sg_remove_sfp); } else { INIT_WORK(&srp->ew.work, sg_rq_end_io_usercontext); schedule_work(&srp->ew.work); } return RQ_END_IO_NONE; } static const struct file_operations sg_fops = { .owner = THIS_MODULE, .read = sg_read, .write = sg_write, .poll = sg_poll, .unlocked_ioctl = sg_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = sg_open, .mmap = sg_mmap, .release = sg_release, .fasync = sg_fasync, }; static const struct class sg_sysfs_class = { .name = "scsi_generic" }; static int sg_sysfs_valid = 0; static Sg_device * sg_alloc(struct scsi_device *scsidp) { struct request_queue *q = scsidp->request_queue; Sg_device *sdp; unsigned long iflags; int error; u32 k; sdp = kzalloc(sizeof(Sg_device), GFP_KERNEL); if (!sdp) { sdev_printk(KERN_WARNING, scsidp, "%s: kmalloc Sg_device " "failure\n", __func__); return ERR_PTR(-ENOMEM); } idr_preload(GFP_KERNEL); write_lock_irqsave(&sg_index_lock, iflags); error = idr_alloc(&sg_index_idr, sdp, 0, SG_MAX_DEVS, GFP_NOWAIT); if (error < 0) { if (error == -ENOSPC) { sdev_printk(KERN_WARNING, scsidp, "Unable to attach sg device type=%d, minor number exceeds %d\n", scsidp->type, SG_MAX_DEVS - 1); error = -ENODEV; } else { sdev_printk(KERN_WARNING, scsidp, "%s: idr " "allocation Sg_device failure: %d\n", __func__, error); } goto out_unlock; } k = error; SCSI_LOG_TIMEOUT(3, sdev_printk(KERN_INFO, scsidp, "sg_alloc: dev=%d \n", k)); sprintf(sdp->name, "sg%d", k); sdp->device = scsidp; mutex_init(&sdp->open_rel_lock); INIT_LIST_HEAD(&sdp->sfds); init_waitqueue_head(&sdp->open_wait); atomic_set(&sdp->detaching, 0); rwlock_init(&sdp->sfd_lock); sdp->sg_tablesize = queue_max_segments(q); sdp->index = k; kref_init(&sdp->d_ref); error = 0; out_unlock: write_unlock_irqrestore(&sg_index_lock, iflags); idr_preload_end(); if (error) { kfree(sdp); return ERR_PTR(error); } return sdp; } static int sg_add_device(struct device *cl_dev) { struct scsi_device *scsidp = to_scsi_device(cl_dev->parent); Sg_device *sdp = NULL; struct cdev * cdev = NULL; int error; unsigned long iflags; if (!blk_get_queue(scsidp->request_queue)) { pr_warn("%s: get scsi_device queue failed\n", __func__); return -ENODEV; } error = -ENOMEM; cdev = cdev_alloc(); if (!cdev) { pr_warn("%s: cdev_alloc failed\n", __func__); goto out; } cdev->owner = THIS_MODULE; cdev->ops = &sg_fops; sdp = sg_alloc(scsidp); if (IS_ERR(sdp)) { pr_warn("%s: sg_alloc failed\n", __func__); error = PTR_ERR(sdp); goto out; } error = cdev_add(cdev, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), 1); if (error) goto cdev_add_err; sdp->cdev = cdev; if (sg_sysfs_valid) { struct device *sg_class_member; sg_class_member = device_create(&sg_sysfs_class, cl_dev->parent, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), sdp, "%s", sdp->name); if (IS_ERR(sg_class_member)) { pr_err("%s: device_create failed\n", __func__); error = PTR_ERR(sg_class_member); goto cdev_add_err; } error = sysfs_create_link(&scsidp->sdev_gendev.kobj, &sg_class_member->kobj, "generic"); if (error) pr_err("%s: unable to make symlink 'generic' back " "to sg%d\n", __func__, sdp->index); } else pr_warn("%s: sg_sys Invalid\n", __func__); sdev_printk(KERN_NOTICE, scsidp, "Attached scsi generic sg%d " "type %d\n", sdp->index, scsidp->type); dev_set_drvdata(cl_dev, sdp); return 0; cdev_add_err: write_lock_irqsave(&sg_index_lock, iflags); idr_remove(&sg_index_idr, sdp->index); write_unlock_irqrestore(&sg_index_lock, iflags); kfree(sdp); out: if (cdev) cdev_del(cdev); blk_put_queue(scsidp->request_queue); return error; } static void sg_device_destroy(struct kref *kref) { struct sg_device *sdp = container_of(kref, struct sg_device, d_ref); struct request_queue *q = sdp->device->request_queue; unsigned long flags; /* CAUTION! Note that the device can still be found via idr_find() * even though the refcount is 0. Therefore, do idr_remove() BEFORE * any other cleanup. */ blk_trace_remove(q); blk_put_queue(q); write_lock_irqsave(&sg_index_lock, flags); idr_remove(&sg_index_idr, sdp->index); write_unlock_irqrestore(&sg_index_lock, flags); SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_device_destroy\n")); kfree(sdp); } static void sg_remove_device(struct device *cl_dev) { struct scsi_device *scsidp = to_scsi_device(cl_dev->parent); Sg_device *sdp = dev_get_drvdata(cl_dev); unsigned long iflags; Sg_fd *sfp; int val; if (!sdp) return; /* want sdp->detaching non-zero as soon as possible */ val = atomic_inc_return(&sdp->detaching); if (val > 1) return; /* only want to do following once per device */ SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "%s\n", __func__)); read_lock_irqsave(&sdp->sfd_lock, iflags); list_for_each_entry(sfp, &sdp->sfds, sfd_siblings) { wake_up_interruptible_all(&sfp->read_wait); kill_fasync(&sfp->async_qp, SIGPOLL, POLL_HUP); } wake_up_interruptible_all(&sdp->open_wait); read_unlock_irqrestore(&sdp->sfd_lock, iflags); sysfs_remove_link(&scsidp->sdev_gendev.kobj, "generic"); device_destroy(&sg_sysfs_class, MKDEV(SCSI_GENERIC_MAJOR, sdp->index)); cdev_del(sdp->cdev); sdp->cdev = NULL; kref_put(&sdp->d_ref, sg_device_destroy); } module_param_named(scatter_elem_sz, scatter_elem_sz, int, S_IRUGO | S_IWUSR); module_param_named(def_reserved_size, def_reserved_size, int, S_IRUGO | S_IWUSR); module_param_named(allow_dio, sg_allow_dio, int, S_IRUGO | S_IWUSR); MODULE_AUTHOR("Douglas Gilbert"); MODULE_DESCRIPTION("SCSI generic (sg) driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(SG_VERSION_STR); MODULE_ALIAS_CHARDEV_MAJOR(SCSI_GENERIC_MAJOR); MODULE_PARM_DESC(scatter_elem_sz, "scatter gather element " "size (default: max(SG_SCATTER_SZ, PAGE_SIZE))"); MODULE_PARM_DESC(def_reserved_size, "size of buffer reserved for each fd"); MODULE_PARM_DESC(allow_dio, "allow direct I/O (default: 0 (disallow))"); #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static const struct ctl_table sg_sysctls[] = { { .procname = "sg-big-buff", .data = &sg_big_buff, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, }; static struct ctl_table_header *hdr; static void register_sg_sysctls(void) { if (!hdr) hdr = register_sysctl("kernel", sg_sysctls); } static void unregister_sg_sysctls(void) { unregister_sysctl_table(hdr); } #else #define register_sg_sysctls() do { } while (0) #define unregister_sg_sysctls() do { } while (0) #endif /* CONFIG_SYSCTL */ static int __init init_sg(void) { int rc; if (scatter_elem_sz < PAGE_SIZE) { scatter_elem_sz = PAGE_SIZE; scatter_elem_sz_prev = scatter_elem_sz; } if (def_reserved_size >= 0) sg_big_buff = def_reserved_size; else def_reserved_size = sg_big_buff; rc = register_chrdev_region(MKDEV(SCSI_GENERIC_MAJOR, 0), SG_MAX_DEVS, "sg"); if (rc) return rc; rc = class_register(&sg_sysfs_class); if (rc) goto err_out; sg_sysfs_valid = 1; rc = scsi_register_interface(&sg_interface); if (0 == rc) { #ifdef CONFIG_SCSI_PROC_FS sg_proc_init(); #endif /* CONFIG_SCSI_PROC_FS */ return 0; } class_unregister(&sg_sysfs_class); register_sg_sysctls(); err_out: unregister_chrdev_region(MKDEV(SCSI_GENERIC_MAJOR, 0), SG_MAX_DEVS); return rc; } static void __exit exit_sg(void) { unregister_sg_sysctls(); #ifdef CONFIG_SCSI_PROC_FS remove_proc_subtree("scsi/sg", NULL); #endif /* CONFIG_SCSI_PROC_FS */ scsi_unregister_interface(&sg_interface); class_unregister(&sg_sysfs_class); sg_sysfs_valid = 0; unregister_chrdev_region(MKDEV(SCSI_GENERIC_MAJOR, 0), SG_MAX_DEVS); idr_destroy(&sg_index_idr); } static int sg_start_req(Sg_request *srp, unsigned char *cmd) { int res; struct request *rq; Sg_fd *sfp = srp->parentfp; sg_io_hdr_t *hp = &srp->header; int dxfer_len = (int) hp->dxfer_len; int dxfer_dir = hp->dxfer_direction; unsigned int iov_count = hp->iovec_count; Sg_scatter_hold *req_schp = &srp->data; Sg_scatter_hold *rsv_schp = &sfp->reserve; struct request_queue *q = sfp->parentdp->device->request_queue; struct rq_map_data *md, map_data; int rw = hp->dxfer_direction == SG_DXFER_TO_DEV ? ITER_SOURCE : ITER_DEST; struct scsi_cmnd *scmd; SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sfp->parentdp, "sg_start_req: dxfer_len=%d\n", dxfer_len)); /* * NOTE * * With scsi-mq enabled, there are a fixed number of preallocated * requests equal in number to shost->can_queue. If all of the * preallocated requests are already in use, then scsi_alloc_request() * will sleep until an active command completes, freeing up a request. * Although waiting in an asynchronous interface is less than ideal, we * do not want to use BLK_MQ_REQ_NOWAIT here because userspace might * not expect an EWOULDBLOCK from this condition. */ rq = scsi_alloc_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); scmd = blk_mq_rq_to_pdu(rq); if (hp->cmd_len > sizeof(scmd->cmnd)) { blk_mq_free_request(rq); return -EINVAL; } memcpy(scmd->cmnd, cmd, hp->cmd_len); scmd->cmd_len = hp->cmd_len; srp->rq = rq; rq->end_io_data = srp; scmd->allowed = SG_DEFAULT_RETRIES; if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE)) return 0; if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO && dxfer_dir != SG_DXFER_UNKNOWN && !iov_count && blk_rq_aligned(q, (unsigned long)hp->dxferp, dxfer_len)) md = NULL; else md = &map_data; if (md) { mutex_lock(&sfp->f_mutex); if (dxfer_len <= rsv_schp->bufflen && !sfp->res_in_use) { sfp->res_in_use = 1; sg_link_reserve(sfp, srp, dxfer_len); } else if (hp->flags & SG_FLAG_MMAP_IO) { res = -EBUSY; /* sfp->res_in_use == 1 */ if (dxfer_len > rsv_schp->bufflen) res = -ENOMEM; mutex_unlock(&sfp->f_mutex); return res; } else { res = sg_build_indirect(req_schp, sfp, dxfer_len); if (res) { mutex_unlock(&sfp->f_mutex); return res; } } mutex_unlock(&sfp->f_mutex); md->pages = req_schp->pages; md->page_order = req_schp->page_order; md->nr_entries = req_schp->k_use_sg; md->offset = 0; md->null_mapped = hp->dxferp ? 0 : 1; if (dxfer_dir == SG_DXFER_TO_FROM_DEV) md->from_user = 1; else md->from_user = 0; } res = blk_rq_map_user_io(rq, md, hp->dxferp, hp->dxfer_len, GFP_ATOMIC, iov_count, iov_count, 1, rw); if (!res) { srp->bio = rq->bio; if (!md) { req_schp->dio_in_use = 1; hp->info |= SG_INFO_DIRECT_IO; } } return res; } static int sg_finish_rem_req(Sg_request *srp) { int ret = 0; Sg_fd *sfp = srp->parentfp; Sg_scatter_hold *req_schp = &srp->data; SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sfp->parentdp, "sg_finish_rem_req: res_used=%d\n", (int) srp->res_used)); if (srp->bio) ret = blk_rq_unmap_user(srp->bio); if (srp->rq) blk_mq_free_request(srp->rq); if (srp->res_used) sg_unlink_reserve(sfp, srp); else sg_remove_scat(sfp, req_schp); return ret; } static int sg_build_sgat(Sg_scatter_hold * schp, const Sg_fd * sfp, int tablesize) { int sg_bufflen = tablesize * sizeof(struct page *); gfp_t gfp_flags = GFP_ATOMIC | __GFP_NOWARN; schp->pages = kzalloc(sg_bufflen, gfp_flags); if (!schp->pages) return -ENOMEM; schp->sglist_len = sg_bufflen; return tablesize; /* number of scat_gath elements allocated */ } static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size) { int ret_sz = 0, i, k, rem_sz, num, mx_sc_elems; int sg_tablesize = sfp->parentdp->sg_tablesize; int blk_size = buff_size, order; gfp_t gfp_mask = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN | __GFP_ZERO; if (blk_size < 0) return -EFAULT; if (0 == blk_size) ++blk_size; /* don't know why */ /* round request up to next highest SG_SECTOR_SZ byte boundary */ blk_size = ALIGN(blk_size, SG_SECTOR_SZ); SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sfp->parentdp, "sg_build_indirect: buff_size=%d, blk_size=%d\n", buff_size, blk_size)); /* N.B. ret_sz carried into this block ... */ mx_sc_elems = sg_build_sgat(schp, sfp, sg_tablesize); if (mx_sc_elems < 0) return mx_sc_elems; /* most likely -ENOMEM */ num = scatter_elem_sz; if (unlikely(num != scatter_elem_sz_prev)) { if (num < PAGE_SIZE) { scatter_elem_sz = PAGE_SIZE; scatter_elem_sz_prev = PAGE_SIZE; } else scatter_elem_sz_prev = num; } order = get_order(num); retry: ret_sz = 1 << (PAGE_SHIFT + order); for (k = 0, rem_sz = blk_size; rem_sz > 0 && k < mx_sc_elems; k++, rem_sz -= ret_sz) { num = (rem_sz > scatter_elem_sz_prev) ? scatter_elem_sz_prev : rem_sz; schp->pages[k] = alloc_pages(gfp_mask, order); if (!schp->pages[k]) goto out; if (num == scatter_elem_sz_prev) { if (unlikely(ret_sz > scatter_elem_sz_prev)) { scatter_elem_sz = ret_sz; scatter_elem_sz_prev = ret_sz; } } SCSI_LOG_TIMEOUT(5, sg_printk(KERN_INFO, sfp->parentdp, "sg_build_indirect: k=%d, num=%d, ret_sz=%d\n", k, num, ret_sz)); } /* end of for loop */ schp->page_order = order; schp->k_use_sg = k; SCSI_LOG_TIMEOUT(5, sg_printk(KERN_INFO, sfp->parentdp, "sg_build_indirect: k_use_sg=%d, rem_sz=%d\n", k, rem_sz)); schp->bufflen = blk_size; if (rem_sz > 0) /* must have failed */ return -ENOMEM; return 0; out: for (i = 0; i < k; i++) __free_pages(schp->pages[i], order); if (--order >= 0) goto retry; return -ENOMEM; } static void sg_remove_scat(Sg_fd * sfp, Sg_scatter_hold * schp) { SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sfp->parentdp, "sg_remove_scat: k_use_sg=%d\n", schp->k_use_sg)); if (schp->pages && schp->sglist_len > 0) { if (!schp->dio_in_use) { int k; for (k = 0; k < schp->k_use_sg && schp->pages[k]; k++) { SCSI_LOG_TIMEOUT(5, sg_printk(KERN_INFO, sfp->parentdp, "sg_remove_scat: k=%d, pg=0x%p\n", k, schp->pages[k])); __free_pages(schp->pages[k], schp->page_order); } kfree(schp->pages); } } memset(schp, 0, sizeof (*schp)); } static int sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer) { Sg_scatter_hold *schp = &srp->data; int k, num; SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, srp->parentfp->parentdp, "sg_read_oxfer: num_read_xfer=%d\n", num_read_xfer)); if ((!outp) || (num_read_xfer <= 0)) return 0; num = 1 << (PAGE_SHIFT + schp->page_order); for (k = 0; k < schp->k_use_sg && schp->pages[k]; k++) { if (num > num_read_xfer) { if (copy_to_user(outp, page_address(schp->pages[k]), num_read_xfer)) return -EFAULT; break; } else { if (copy_to_user(outp, page_address(schp->pages[k]), num)) return -EFAULT; num_read_xfer -= num; if (num_read_xfer <= 0) break; outp += num; } } return 0; } static void sg_build_reserve(Sg_fd * sfp, int req_size) { Sg_scatter_hold *schp = &sfp->reserve; SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sfp->parentdp, "sg_build_reserve: req_size=%d\n", req_size)); do { if (req_size < PAGE_SIZE) req_size = PAGE_SIZE; if (0 == sg_build_indirect(schp, sfp, req_size)) return; else sg_remove_scat(sfp, schp); req_size >>= 1; /* divide by 2 */ } while (req_size > (PAGE_SIZE / 2)); } static void sg_link_reserve(Sg_fd * sfp, Sg_request * srp, int size) { Sg_scatter_hold *req_schp = &srp->data; Sg_scatter_hold *rsv_schp = &sfp->reserve; int k, num, rem; srp->res_used = 1; SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sfp->parentdp, "sg_link_reserve: size=%d\n", size)); rem = size; num = 1 << (PAGE_SHIFT + rsv_schp->page_order); for (k = 0; k < rsv_schp->k_use_sg; k++) { if (rem <= num) { req_schp->k_use_sg = k + 1; req_schp->sglist_len = rsv_schp->sglist_len; req_schp->pages = rsv_schp->pages; req_schp->bufflen = size; req_schp->page_order = rsv_schp->page_order; break; } else rem -= num; } if (k >= rsv_schp->k_use_sg) SCSI_LOG_TIMEOUT(1, sg_printk(KERN_INFO, sfp->parentdp, "sg_link_reserve: BAD size\n")); } static void sg_unlink_reserve(Sg_fd * sfp, Sg_request * srp) { Sg_scatter_hold *req_schp = &srp->data; SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, srp->parentfp->parentdp, "sg_unlink_reserve: req->k_use_sg=%d\n", (int) req_schp->k_use_sg)); req_schp->k_use_sg = 0; req_schp->bufflen = 0; req_schp->pages = NULL; req_schp->page_order = 0; req_schp->sglist_len = 0; srp->res_used = 0; /* Called without mutex lock to avoid deadlock */ sfp->res_in_use = 0; } static Sg_request * sg_get_rq_mark(Sg_fd * sfp, int pack_id, bool *busy) { Sg_request *resp; unsigned long iflags; *busy = false; write_lock_irqsave(&sfp->rq_list_lock, iflags); list_for_each_entry(resp, &sfp->rq_list, entry) { /* look for requests that are not SG_IO owned */ if ((!resp->sg_io_owned) && ((-1 == pack_id) || (resp->header.pack_id == pack_id))) { switch (resp->done) { case 0: /* request active */ *busy = true; break; case 1: /* request done; response ready to return */ resp->done = 2; /* guard against other readers */ write_unlock_irqrestore(&sfp->rq_list_lock, iflags); return resp; case 2: /* response already being returned */ break; } } } write_unlock_irqrestore(&sfp->rq_list_lock, iflags); return NULL; } /* always adds to end of list */ static Sg_request * sg_add_request(Sg_fd * sfp) { int k; unsigned long iflags; Sg_request *rp = sfp->req_arr; write_lock_irqsave(&sfp->rq_list_lock, iflags); if (!list_empty(&sfp->rq_list)) { if (!sfp->cmd_q) goto out_unlock; for (k = 0; k < SG_MAX_QUEUE; ++k, ++rp) { if (!rp->parentfp) break; } if (k >= SG_MAX_QUEUE) goto out_unlock; } memset(rp, 0, sizeof (Sg_request)); rp->parentfp = sfp; rp->header.duration = jiffies_to_msecs(jiffies); list_add_tail(&rp->entry, &sfp->rq_list); write_unlock_irqrestore(&sfp->rq_list_lock, iflags); return rp; out_unlock: write_unlock_irqrestore(&sfp->rq_list_lock, iflags); return NULL; } /* Return of 1 for found; 0 for not found */ static int sg_remove_request(Sg_fd * sfp, Sg_request * srp) { unsigned long iflags; int res = 0; if (!sfp || !srp || list_empty(&sfp->rq_list)) return res; write_lock_irqsave(&sfp->rq_list_lock, iflags); if (!list_empty(&srp->entry)) { list_del(&srp->entry); srp->parentfp = NULL; res = 1; } write_unlock_irqrestore(&sfp->rq_list_lock, iflags); /* * If the device is detaching, wakeup any readers in case we just * removed the last response, which would leave nothing for them to * return other than -ENODEV. */ if (unlikely(atomic_read(&sfp->parentdp->detaching))) wake_up_interruptible_all(&sfp->read_wait); return res; } static Sg_fd * sg_add_sfp(Sg_device * sdp) { Sg_fd *sfp; unsigned long iflags; int bufflen; sfp = kzalloc(sizeof(*sfp), GFP_ATOMIC | __GFP_NOWARN); if (!sfp) return ERR_PTR(-ENOMEM); init_waitqueue_head(&sfp->read_wait); rwlock_init(&sfp->rq_list_lock); INIT_LIST_HEAD(&sfp->rq_list); kref_init(&sfp->f_ref); mutex_init(&sfp->f_mutex); sfp->timeout = SG_DEFAULT_TIMEOUT; sfp->timeout_user = SG_DEFAULT_TIMEOUT_USER; sfp->force_packid = SG_DEF_FORCE_PACK_ID; sfp->cmd_q = SG_DEF_COMMAND_Q; sfp->keep_orphan = SG_DEF_KEEP_ORPHAN; sfp->parentdp = sdp; write_lock_irqsave(&sdp->sfd_lock, iflags); if (atomic_read(&sdp->detaching)) { write_unlock_irqrestore(&sdp->sfd_lock, iflags); kfree(sfp); return ERR_PTR(-ENODEV); } list_add_tail(&sfp->sfd_siblings, &sdp->sfds); write_unlock_irqrestore(&sdp->sfd_lock, iflags); SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_add_sfp: sfp=0x%p\n", sfp)); if (unlikely(sg_big_buff != def_reserved_size)) sg_big_buff = def_reserved_size; bufflen = min_t(int, sg_big_buff, max_sectors_bytes(sdp->device->request_queue)); sg_build_reserve(sfp, bufflen); SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_add_sfp: bufflen=%d, k_use_sg=%d\n", sfp->reserve.bufflen, sfp->reserve.k_use_sg)); kref_get(&sdp->d_ref); __module_get(THIS_MODULE); return sfp; } static void sg_remove_sfp_usercontext(struct work_struct *work) { struct sg_fd *sfp = container_of(work, struct sg_fd, ew.work); struct sg_device *sdp = sfp->parentdp; struct scsi_device *device = sdp->device; Sg_request *srp; unsigned long iflags; /* Cleanup any responses which were never read(). */ write_lock_irqsave(&sfp->rq_list_lock, iflags); while (!list_empty(&sfp->rq_list)) { srp = list_first_entry(&sfp->rq_list, Sg_request, entry); sg_finish_rem_req(srp); list_del(&srp->entry); srp->parentfp = NULL; } write_unlock_irqrestore(&sfp->rq_list_lock, iflags); if (sfp->reserve.bufflen > 0) { SCSI_LOG_TIMEOUT(6, sg_printk(KERN_INFO, sdp, "sg_remove_sfp: bufflen=%d, k_use_sg=%d\n", (int) sfp->reserve.bufflen, (int) sfp->reserve.k_use_sg)); sg_remove_scat(sfp, &sfp->reserve); } SCSI_LOG_TIMEOUT(6, sg_printk(KERN_INFO, sdp, "sg_remove_sfp: sfp=0x%p\n", sfp)); kfree(sfp); kref_put(&sdp->d_ref, sg_device_destroy); scsi_device_put(device); module_put(THIS_MODULE); } static void sg_remove_sfp(struct kref *kref) { struct sg_fd *sfp = container_of(kref, struct sg_fd, f_ref); struct sg_device *sdp = sfp->parentdp; unsigned long iflags; write_lock_irqsave(&sdp->sfd_lock, iflags); list_del(&sfp->sfd_siblings); write_unlock_irqrestore(&sdp->sfd_lock, iflags); INIT_WORK(&sfp->ew.work, sg_remove_sfp_usercontext); schedule_work(&sfp->ew.work); } #ifdef CONFIG_SCSI_PROC_FS static int sg_idr_max_id(int id, void *p, void *data) { int *k = data; if (*k < id) *k = id; return 0; } static int sg_last_dev(void) { int k = -1; unsigned long iflags; read_lock_irqsave(&sg_index_lock, iflags); idr_for_each(&sg_index_idr, sg_idr_max_id, &k); read_unlock_irqrestore(&sg_index_lock, iflags); return k + 1; /* origin 1 */ } #endif /* must be called with sg_index_lock held */ static Sg_device *sg_lookup_dev(int dev) { return idr_find(&sg_index_idr, dev); } static Sg_device * sg_get_dev(int dev) { struct sg_device *sdp; unsigned long flags; read_lock_irqsave(&sg_index_lock, flags); sdp = sg_lookup_dev(dev); if (!sdp) sdp = ERR_PTR(-ENXIO); else if (atomic_read(&sdp->detaching)) { /* If sdp->detaching, then the refcount may already be 0, in * which case it would be a bug to do kref_get(). */ sdp = ERR_PTR(-ENODEV); } else kref_get(&sdp->d_ref); read_unlock_irqrestore(&sg_index_lock, flags); return sdp; } #ifdef CONFIG_SCSI_PROC_FS static int sg_proc_seq_show_int(struct seq_file *s, void *v); static int sg_proc_single_open_adio(struct inode *inode, struct file *file); static ssize_t sg_proc_write_adio(struct file *filp, const char __user *buffer, size_t count, loff_t *off); static const struct proc_ops adio_proc_ops = { .proc_open = sg_proc_single_open_adio, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_write = sg_proc_write_adio, .proc_release = single_release, }; static int sg_proc_single_open_dressz(struct inode *inode, struct file *file); static ssize_t sg_proc_write_dressz(struct file *filp, const char __user *buffer, size_t count, loff_t *off); static const struct proc_ops dressz_proc_ops = { .proc_open = sg_proc_single_open_dressz, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_write = sg_proc_write_dressz, .proc_release = single_release, }; static int sg_proc_seq_show_version(struct seq_file *s, void *v); static int sg_proc_seq_show_devhdr(struct seq_file *s, void *v); static int sg_proc_seq_show_dev(struct seq_file *s, void *v); static void * dev_seq_start(struct seq_file *s, loff_t *pos); static void * dev_seq_next(struct seq_file *s, void *v, loff_t *pos); static void dev_seq_stop(struct seq_file *s, void *v); static const struct seq_operations dev_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, .stop = dev_seq_stop, .show = sg_proc_seq_show_dev, }; static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v); static const struct seq_operations devstrs_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, .stop = dev_seq_stop, .show = sg_proc_seq_show_devstrs, }; static int sg_proc_seq_show_debug(struct seq_file *s, void *v); static const struct seq_operations debug_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, .stop = dev_seq_stop, .show = sg_proc_seq_show_debug, }; static int sg_proc_init(void) { struct proc_dir_entry *p; p = proc_mkdir("scsi/sg", NULL); if (!p) return 1; proc_create("allow_dio", S_IRUGO | S_IWUSR, p, &adio_proc_ops); proc_create_seq("debug", S_IRUGO, p, &debug_seq_ops); proc_create("def_reserved_size", S_IRUGO | S_IWUSR, p, &dressz_proc_ops); proc_create_single("device_hdr", S_IRUGO, p, sg_proc_seq_show_devhdr); proc_create_seq("devices", S_IRUGO, p, &dev_seq_ops); proc_create_seq("device_strs", S_IRUGO, p, &devstrs_seq_ops); proc_create_single("version", S_IRUGO, p, sg_proc_seq_show_version); return 0; } static int sg_proc_seq_show_int(struct seq_file *s, void *v) { seq_printf(s, "%d\n", *((int *)s->private)); return 0; } static int sg_proc_single_open_adio(struct inode *inode, struct file *file) { return single_open(file, sg_proc_seq_show_int, &sg_allow_dio); } static ssize_t sg_proc_write_adio(struct file *filp, const char __user *buffer, size_t count, loff_t *off) { int err; unsigned long num; if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) return -EACCES; err = kstrtoul_from_user(buffer, count, 0, &num); if (err) return err; sg_allow_dio = num ? 1 : 0; return count; } static int sg_proc_single_open_dressz(struct inode *inode, struct file *file) { return single_open(file, sg_proc_seq_show_int, &sg_big_buff); } static ssize_t sg_proc_write_dressz(struct file *filp, const char __user *buffer, size_t count, loff_t *off) { int err; unsigned long k = ULONG_MAX; if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) return -EACCES; err = kstrtoul_from_user(buffer, count, 0, &k); if (err) return err; if (k <= 1048576) { /* limit "big buff" to 1 MB */ sg_big_buff = k; return count; } return -ERANGE; } static int sg_proc_seq_show_version(struct seq_file *s, void *v) { seq_printf(s, "%d\t%s [%s]\n", sg_version_num, SG_VERSION_STR, sg_version_date); return 0; } static int sg_proc_seq_show_devhdr(struct seq_file *s, void *v) { seq_puts(s, "host\tchan\tid\tlun\ttype\topens\tqdepth\tbusy\tonline\n"); return 0; } struct sg_proc_deviter { loff_t index; size_t max; }; static void * dev_seq_start(struct seq_file *s, loff_t *pos) { struct sg_proc_deviter * it = kmalloc(sizeof(*it), GFP_KERNEL); s->private = it; if (! it) return NULL; it->index = *pos; it->max = sg_last_dev(); if (it->index >= it->max) return NULL; return it; } static void * dev_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct sg_proc_deviter * it = s->private; *pos = ++it->index; return (it->index < it->max) ? it : NULL; } static void dev_seq_stop(struct seq_file *s, void *v) { kfree(s->private); } static int sg_proc_seq_show_dev(struct seq_file *s, void *v) { struct sg_proc_deviter * it = (struct sg_proc_deviter *) v; Sg_device *sdp; struct scsi_device *scsidp; unsigned long iflags; read_lock_irqsave(&sg_index_lock, iflags); sdp = it ? sg_lookup_dev(it->index) : NULL; if ((NULL == sdp) || (NULL == sdp->device) || (atomic_read(&sdp->detaching))) seq_puts(s, "-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\n"); else { scsidp = sdp->device; seq_printf(s, "%d\t%d\t%d\t%llu\t%d\t%d\t%d\t%d\t%d\n", scsidp->host->host_no, scsidp->channel, scsidp->id, scsidp->lun, (int) scsidp->type, 1, (int) scsidp->queue_depth, (int) scsi_device_busy(scsidp), (int) scsi_device_online(scsidp)); } read_unlock_irqrestore(&sg_index_lock, iflags); return 0; } static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v) { struct sg_proc_deviter * it = (struct sg_proc_deviter *) v; Sg_device *sdp; struct scsi_device *scsidp; unsigned long iflags; read_lock_irqsave(&sg_index_lock, iflags); sdp = it ? sg_lookup_dev(it->index) : NULL; scsidp = sdp ? sdp->device : NULL; if (sdp && scsidp && (!atomic_read(&sdp->detaching))) seq_printf(s, "%8.8s\t%16.16s\t%4.4s\n", scsidp->vendor, scsidp->model, scsidp->rev); else seq_puts(s, "<no active device>\n"); read_unlock_irqrestore(&sg_index_lock, iflags); return 0; } /* must be called while holding sg_index_lock */ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp) { int k, new_interface, blen, usg; Sg_request *srp; Sg_fd *fp; const sg_io_hdr_t *hp; const char * cp; unsigned int ms; k = 0; list_for_each_entry(fp, &sdp->sfds, sfd_siblings) { k++; read_lock(&fp->rq_list_lock); /* irqs already disabled */ seq_printf(s, " FD(%d): timeout=%dms bufflen=%d " "(res)sgat=%d low_dma=%d\n", k, jiffies_to_msecs(fp->timeout), fp->reserve.bufflen, (int) fp->reserve.k_use_sg, 0); seq_printf(s, " cmd_q=%d f_packid=%d k_orphan=%d closed=0\n", (int) fp->cmd_q, (int) fp->force_packid, (int) fp->keep_orphan); list_for_each_entry(srp, &fp->rq_list, entry) { hp = &srp->header; new_interface = (hp->interface_id == '\0') ? 0 : 1; if (srp->res_used) { if (new_interface && (SG_FLAG_MMAP_IO & hp->flags)) cp = " mmap>> "; else cp = " rb>> "; } else { if (SG_INFO_DIRECT_IO_MASK & hp->info) cp = " dio>> "; else cp = " "; } seq_puts(s, cp); blen = srp->data.bufflen; usg = srp->data.k_use_sg; seq_puts(s, srp->done ? ((1 == srp->done) ? "rcv:" : "fin:") : "act:"); seq_printf(s, " id=%d blen=%d", srp->header.pack_id, blen); if (srp->done) seq_printf(s, " dur=%d", hp->duration); else { ms = jiffies_to_msecs(jiffies); seq_printf(s, " t_o/elap=%d/%d", (new_interface ? hp->timeout : jiffies_to_msecs(fp->timeout)), (ms > hp->duration ? ms - hp->duration : 0)); } seq_printf(s, "ms sgat=%d op=0x%02x\n", usg, (int) srp->data.cmd_opcode); } if (list_empty(&fp->rq_list)) seq_puts(s, " No requests active\n"); read_unlock(&fp->rq_list_lock); } } static int sg_proc_seq_show_debug(struct seq_file *s, void *v) { struct sg_proc_deviter * it = (struct sg_proc_deviter *) v; Sg_device *sdp; unsigned long iflags; if (it && (0 == it->index)) seq_printf(s, "max_active_device=%d def_reserved_size=%d\n", (int)it->max, sg_big_buff); read_lock_irqsave(&sg_index_lock, iflags); sdp = it ? sg_lookup_dev(it->index) : NULL; if (NULL == sdp) goto skip; read_lock(&sdp->sfd_lock); if (!list_empty(&sdp->sfds)) { seq_printf(s, " >>> device=%s ", sdp->name); if (atomic_read(&sdp->detaching)) seq_puts(s, "detaching pending close "); else if (sdp->device) { struct scsi_device *scsidp = sdp->device; seq_printf(s, "%d:%d:%d:%llu em=%d", scsidp->host->host_no, scsidp->channel, scsidp->id, scsidp->lun, scsidp->host->hostt->emulated); } seq_printf(s, " sg_tablesize=%d excl=%d open_cnt=%d\n", sdp->sg_tablesize, sdp->exclude, sdp->open_cnt); sg_proc_debug_helper(s, sdp); } read_unlock(&sdp->sfd_lock); skip: read_unlock_irqrestore(&sg_index_lock, iflags); return 0; } #endif /* CONFIG_SCSI_PROC_FS */ module_init(init_sg); module_exit(exit_sg);
3430 3175 316 4 19 77 77 37 43 17 23 18 8 4 7 1 54 2102 173 3706 6 962 48 896 1880 1882 1880 3577 3576 3571 5 1562 2102 1 3570 4 15 3559 6 3226 406 3560 2 147 1314 1 182 2 1273 1 1410 24 5 985 15 929 1 1 1469 3 297 2 33 3285 310 34 276 391 2052 1288 3292 1041 170 11 941 569 1 1788 568 2101 2100 2101 659 3826 3839 3837 2004 1964 2245 14 3827 3826 1078 1080 1894 1902 17309 17292 984 714 277 19 17 18 18 18 14 16 2 274 22 1455 132 132 4 2 1 125 13 13 110 2 1 2 1 3 2 27 101 834 835 836 830 831 830 492 819 826 824 824 3762 84 3678 11 7 7 3676 140 42 1792 2069 480 480 18 52 52 78 78 78 336 336 336 2 2 1332 1333 1333 1332 1332 1333 11 11 1333 1333 6 1333 1332 54 1333 1333 76 1333 1332 1325 1332 6 1332 1328 1329 6 1333 1332 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 // SPDX-License-Identifier: GPL-2.0-only /* * mm/mmap.c * * Written by obz. * * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/init.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/profile.h> #include <linux/export.h> #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> #include <linux/uprobes.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> #include <linux/pkeys.h> #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/ksm.h> #include <linux/memfd.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/mmu_context.h> #define CREATE_TRACE_POINTS #include <trace/events/mmap.h> #include "internal.h" #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX; int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { vm_flags_t vm_flags = vma->vm_flags; pgprot_t vm_page_prot; vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); if (vma_wants_writenotify(vma, vm_page_prot)) { vm_flags &= ~VM_SHARED; vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. * @addr: The address to check * @len: The size of increase. * * Return: 0 on success. */ static int check_brk_limits(unsigned long addr, unsigned long len) { unsigned long mapped_addr; mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate = false; LIST_HEAD(uf); struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; origbrk = mm->brk; min_brk = mm->start_brk; #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ if (!current->brk_randomized) min_brk = mm->end_data; #endif if (brk < min_brk) goto out; /* * Check against rlimit here. If this check is done later after the test * of oldbrk with newbrk then it can escape the test and let the data * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); if (oldbrk == newbrk) { mm->brk = brk; goto success; } /* Always allow shrinking brk. */ if (brk <= mm->brk) { /* Search one past newbrk */ vma_iter_init(&vmi, mm, newbrk); brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. * do_vmi_align_munmap() will drop the lock on success, so * update it before calling do_vma_munmap(). */ mm->brk = brk; if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf, /* unlock = */ true)) goto out; goto success_unlocked; } if (check_brk_limits(oldbrk, newbrk - oldbrk)) goto out; /* * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ vma_iter_init(&vmi, mm, oldbrk); next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; if (mm->def_flags & VM_LOCKED) populate = true; success: mmap_write_unlock(mm); success_unlocked: userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: mm->brk = origbrk; mmap_write_unlock(mm); return origbrk; } /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr */ static inline unsigned long round_hint_to_min(unsigned long hint) { hint &= PAGE_MASK; if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); return hint; } bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; locked_pages += mm->locked_vm; limit_pages = rlimit(RLIMIT_MEMLOCK); limit_pages >>= PAGE_SHIFT; return locked_pages <= limit_pages; } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) { if (S_ISREG(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISSOCK(inode->i_mode)) return MAX_LFS_FILESIZE; /* Special "we do even unsigned file positions" case */ if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET) return 0; /* Yes, random drivers might want more. But I'm tired of buggy drivers */ return ULONG_MAX; } static inline bool file_mmap_ok(struct file *file, struct inode *inode, unsigned long pgoff, unsigned long len) { u64 maxsize = file_mmap_size_max(file, inode); if (maxsize && len > maxsize) return false; maxsize -= len; if (pgoff > maxsize >> PAGE_SHIFT) return false; return true; } /** * do_mmap() - Perform a userland memory mapping into the current process * address space of length @len with protection bits @prot, mmap flags @flags * (from which VMA flags will be inferred), and any additional VMA flags to * apply @vm_flags. If this is a file-backed mapping then the file is specified * in @file and page offset into the file via @pgoff. * * This function does not perform security checks on the file and assumes, if * @uf is non-NULL, the caller has provided a list head to track unmap events * for userfaultfd @uf. * * It also simply indicates whether memory population is required by setting * @populate, which must be non-NULL, expecting the caller to actually perform * this task itself if appropriate. * * This function will invoke architecture-specific (and if provided and * relevant, file system-specific) logic to determine the most appropriate * unmapped area in which to place the mapping if not MAP_FIXED. * * Callers which require userland mmap() behaviour should invoke vm_mmap(), * which is also exported for module use. * * Those which require this behaviour less security checks, userfaultfd and * populate behaviour, and who handle the mmap write lock themselves, should * call this function. * * Note that the returned address may reside within a merged VMA if an * appropriate merge were to take place, so it doesn't necessarily specify the * start of a VMA, rather only the start of a valid mapped range of length * @len bytes, rounded down to the nearest page size. * * The caller must write-lock current->mm->mmap_lock. * * @file: An optional struct file pointer describing the file which is to be * mapped, if a file-backed mapping. * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the * address at which to perform this mapping. See mmap (2) for details. Must be * page-aligned. * @len: The length of the mapping. Will be page-aligned and must be at least 1 * page in size. * @prot: Protection bits describing access required to the mapping. See mmap * (2) for details. * @flags: Flags specifying how the mapping should be performed, see mmap (2) * for details. * @vm_flags: VMA flags which should be set by default, or 0 otherwise. * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise. * @populate: A pointer to a value which will be set to 0 if no population of * the range is required, or the number of bytes to populate if it is. Must be * non-NULL. See mmap (2) for details as to under what circumstances population * of the range occurs. * @uf: An optional pointer to a list head to track userfaultfd unmap events * should unmapping events arise. If provided, it is up to the caller to manage * this. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; *populate = 0; mmap_assert_write_locked(mm); if (!len) return -EINVAL; /* * Does the application expect PROT_READ to imply PROT_EXEC? * * (the exception is when the underlying filesystem is noexec * mounted, in which case we don't add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; /* force arch specific MAP_FIXED handling in get_unmapped_area */ if (flags & MAP_FIXED_NOREPLACE) flags |= MAP_FIXED; if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len) return -ENOMEM; /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; /* Too many mappings? */ if (mm->map_count > sysctl_max_map_count) return -ENOMEM; /* * addr is returned from get_unmapped_area, * There are two cases: * 1> MAP_FIXED == false * unallocated memory, no need to check sealing. * 1> MAP_FIXED == true * sealing is checked inside mmap_region when * do_vmi_munmap is called. */ if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) pkey = 0; } /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; if (!mlock_future_ok(mm, vm_flags, len)) return -EAGAIN; if (file) { struct inode *inode = file_inode(file); unsigned long flags_mask; int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; flags_mask = LEGACY_MAP_MASK; if (file->f_op->fop_flags & FOP_MMAP_SYNC) flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: /* * Force use of MAP_SHARED_VALIDATE with non-legacy * flags. E.g. MAP_SYNC is dangerous to use with * MAP_SHARED as you don't know which consistency model * you will get. We silently ignore unsupported flags * with MAP_SHARED to preserve backward compatibility. */ flags &= LEGACY_MAP_MASK; fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; if (prot & PROT_WRITE) { if (!(file->f_mode & FMODE_WRITE)) return -EACCES; if (IS_SWAPFILE(file->f_mapping->host)) return -ETXTBSY; } /* * Make sure we don't allow writing to an append-only * file.. */ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; } if (!can_mmap_file(file)) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; break; default: return -EINVAL; } /* * Check to see if we are violating any seals and update VMA * flags if necessary to avoid future seal violations. */ err = memfd_check_seals_mmap(file, &vm_flags); if (err) return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; /* * Ignore pgoff. */ pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_DROPPABLE: if (VM_DROPPABLE == VM_NONE) return -ENOTSUPP; /* * A locked or stack area makes no sense to be droppable. * * Also, since droppable pages can just go away at any time * it makes no sense to copy them on fork or dump them. * * And don't attempt to combine with hugetlb for now. */ if (flags & (MAP_LOCKED | MAP_HUGETLB)) return -EINVAL; if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) return -EINVAL; vm_flags |= VM_DROPPABLE; /* * If the pages can be dropped, then it doesn't make * sense to reserve them. */ vm_flags |= VM_NORESERVE; /* * Likewise, they're volatile enough that they * shouldn't survive forks or coredumps. */ vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; fallthrough; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; } } /* * Set 'VM_NORESERVE' if we should not account for the * memory use of this mapping. */ if (flags & MAP_NORESERVE) { /* We honor MAP_NORESERVE if allowed to overcommit */ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) vm_flags |= VM_NORESERVE; /* hugetlb applies strict overcommit unless MAP_NORESERVE */ if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) return -EBADF; if (is_file_hugepages(file)) { len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (unlikely(flags & MAP_HUGETLB)) { retval = -EINVAL; goto out_fput; } } else if (flags & MAP_HUGETLB) { struct hstate *hs; hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; len = ALIGN(len, huge_page_size(hs)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) fput(file); return retval; } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; unsigned long len; unsigned long prot; unsigned long flags; unsigned long fd; unsigned long offset; }; SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) { struct mmap_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; if (offset_in_page(a.offset)) return -EINVAL; return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ /* * Determine if the allocation needs to ensure that there is no * existing mapping within it's guard gaps, for use as start_gap. */ static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) { if (vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } /* * Search for an unmapped address range. * * We are looking for a range that: * - does not intersect with any VMA; * - is contained within the [low_limit, high_limit) interval; * - is at least the desired size. * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) */ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { unsigned long addr; if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) addr = unmapped_area_topdown(info); else addr = unmapped_area(info); trace_vm_unmapped_area(addr, info); return addr; } /* Get an address range which is currently unmapped. * For shmat() with addr=0. * * Ugly calling convention alert: * Return value with the low bits set means error value, * ie * if (ret & ~PAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. */ unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); return vm_unmapped_area(&info); } #ifndef HAVE_ARCH_UNMAPPED_AREA unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } #endif /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): */ unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } return addr; } #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); } #endif unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { if (test_bit(MMF_TOPDOWN, &mm->flags)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) return error; /* Careful about overflows.. */ if (len > TASK_SIZE) return -ENOMEM; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; } else if (flags & MAP_SHARED) { /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. */ get_area = shmem_get_unmapped_area; } /* Always treat pgoff as zero for anonymous memory. */ if (!file) pgoff = 0; if (get_area) { addr = get_area(file, addr, len, pgoff, flags); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) return addr; if (addr > TASK_SIZE - len) return -ENOMEM; if (offset_in_page(addr)) return -EINVAL; error = security_mmap_addr(addr); return error ? error : addr; } unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area_vmflags(mm, file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. * @start_addr: The inclusive start user address. * @end_addr: The exclusive end user address. * * Returns: The first VMA within the provided range, %NULL otherwise. Assumes * start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { unsigned long index = start_addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check * @addr: The address * * Returns: The VMA associated with addr, or the next VMA. * May return %NULL in the case of no VMA at addr or above. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { unsigned long index = addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); /** * find_vma_prev() - Find the VMA for a given address, or the next vma and * set %pprev to the previous VMA, if any. * @mm: The mm_struct to check * @addr: The address * @pprev: The pointer to set to the previous VMA * * Note that RCU lock is missing here since the external mmap_lock() is used * instead. * * Returns: The VMA associated with @addr, or the next vma. * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, addr); vma = vma_iter_load(&vmi); *pprev = vma_prev(&vmi); if (!vma) vma = vma_next(&vmi); return vma; } /* enforced gap between the expanding stack and other mappings. */ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; static int __init cmdline_parse_stack_guard_gap(char *p) { unsigned long val; char *endptr; val = simple_strtoul(p, &endptr, 10); if (!*endptr) stack_guard_gap = val << PAGE_SHIFT; return 1; } __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_upwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; addr &= PAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; if (!prev) return NULL; if (expand_stack_locked(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_downwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; unsigned long start; addr &= PAGE_MASK; vma = find_vma(mm, addr); if (!vma) return NULL; if (vma->vm_start <= addr) return vma; start = vma->vm_start; if (expand_stack_locked(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); return vma; } #endif #if defined(CONFIG_STACK_GROWSUP) #define vma_expand_up(vma,addr) expand_upwards(vma, addr) #define vma_expand_down(vma, addr) (-EFAULT) #else #define vma_expand_up(vma,addr) (-EFAULT) #define vma_expand_down(vma, addr) expand_downwards(vma, addr) #endif /* * expand_stack(): legacy interface for page faulting. Don't use unless * you have to. * * This is called with the mm locked for reading, drops the lock, takes * the lock for writing, tries to look up a vma again, expands it if * necessary, and downgrades the lock to reading again. * * If no vma is found or it can't be expanded, it returns NULL and has * dropped the lock. */ struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; mmap_read_unlock(mm); if (mmap_write_lock_killable(mm)) return NULL; vma = find_vma_prev(mm, addr, &prev); if (vma && vma->vm_start <= addr) goto success; if (prev && !vma_expand_up(prev, addr)) { vma = prev; goto success; } if (vma && !vma_expand_down(vma, addr)) goto success; mmap_write_unlock(mm); return NULL; success: mmap_write_downgrade(mm); return vma; } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. * @mm: The mm_struct * @start: The start address to munmap * @len: The length to be munmapped. * @uf: The userfaultfd list_head * * Return: 0 on success, error otherwise. */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { VMA_ITERATOR(vmi, mm, start); return do_vmi_munmap(&vmi, mm, start, len, uf, false); } int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); } EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { addr = untagged_addr(addr); return __vm_munmap(addr, len, true); } /* * Emulation of deprecated remap_file_pages() syscall. */ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long populate = 0; unsigned long ret = -EINVAL; struct file *file; vm_flags_t vm_flags; pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) return ret; start = start & PAGE_MASK; size = size & PAGE_MASK; if (start + size <= start) return ret; /* Does pgoff wrap? */ if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; if (mmap_read_lock_killable(mm)) return -EINTR; /* * Look up VMA under read lock first so we can perform the security * without holding locks (which can be problematic). We reacquire a * write lock later and check nothing changed underneath us. */ vma = vma_lookup(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) { mmap_read_unlock(mm); return -EINVAL; } prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; /* Save vm_flags used to calculate prot and flags, and recheck later. */ vm_flags = vma->vm_flags; file = get_file(vma->vm_file); mmap_read_unlock(mm); /* Call outside mmap_lock to be consistent with other callers. */ ret = security_mmap_file(file, prot, flags); if (ret) { fput(file); return ret; } ret = -EINVAL; /* OK security check passed, take write lock + let it rip. */ if (mmap_write_lock_killable(mm)) { fput(file); return -EINTR; } vma = vma_lookup(mm, start); if (!vma) goto out; /* Make sure things didn't change under us. */ if (vma->vm_flags != vm_flags) goto out; if (vma->vm_file != file) goto out; if (start + size > vma->vm_end) { VMA_ITERATOR(vmi, mm, vma->vm_end); struct vm_area_struct *next, *prev = vma; for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? */ if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) goto out; if (next->vm_flags != vma->vm_flags) goto out; if (start + size <= next->vm_end) break; prev = next; } if (!next) goto out; } ret = do_mmap(vma->vm_file, start, size, prot, flags, 0, pgoff, &populate, NULL); out: mmap_write_unlock(mm); fput(file); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) ret = 0; return ret; } int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, addr); len = PAGE_ALIGN(request); if (len < request) return -ENOMEM; if (!len) return 0; /* Until we need other flags, refuse anything except VM_EXEC. */ if ((vm_flags & (~VM_EXEC)) != 0) return -EINVAL; if (mmap_write_lock_killable(mm)) return -EINTR; ret = check_brk_limits(addr, len); if (ret) goto limits_failed; ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; vma = vma_prev(&vmi); ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; munmap_failed: limits_failed: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(vm_brk_flags); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; VMA_ITERATOR(vmi, mm, 0); int count = 0; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); mmap_read_lock(mm); arch_exit_mmap(mm); vma = vma_next(&vmi); if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); mmap_write_lock(mm); goto destroy; } flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false); mmap_read_unlock(mm); /* * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); /* * Walk the list again, actually closing and freeing it, with preemption * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. */ vma_iter_set(&vmi, vma->vm_end); do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma_mark_detached(vma); remove_vma(vma); count++; cond_resched(); vma = vma_next(&vmi); } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } /* * Return true if the calling process may expand its vm space by the passed * number of pages */ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) return true; pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA), ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data"); if (!ignore_rlimit_data) return false; } return true; } void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); if (is_exec_mapping(flags)) mm->exec_vm += npages; else if (is_stack_mapping(flags)) mm->stack_vm += npages; else if (is_data_mapping(flags)) mm->data_vm += npages; } static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Close hook, called for unmap() and on the old vma for mremap(). * * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { const struct vm_special_mapping *sm = vma->vm_private_data; if (sm->close) sm->close(sm, vma); } static const char *special_mapping_name(struct vm_area_struct *vma) { return ((struct vm_special_mapping *)vma->vm_private_data)->name; } static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; if (sm->mremap) return sm->mremap(sm, new_vma); return 0; } static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) { /* * Forbid splitting special mappings - kernel has expectations over * the number of pages in mapping. Together with VM_DONTEXPAND * the size of vma should stay the same over the special mapping's * lifetime. */ return -EINVAL; } static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, /* vDSO code relies that VVAR can't be accessed remotely */ .access = NULL, .may_split = special_mapping_split, }; static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; if (*pages) { struct page *page = *pages; get_page(page); vmf->page = page; return 0; } return VM_FAULT_SIGBUS; } static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, vm_flags_t vm_flags, void *priv, const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); vma_set_range(vma, addr, addr + len, 0); vm_flags_init(vma, (vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) goto out; vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); return vma; out: vm_area_free(vma); return ERR_PTR(ret); } bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm) { return vma->vm_private_data == sm && vma->vm_ops == &special_mapping_vmops; } /* * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. * The region past the last page supplied will always produce SIGBUS. * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. */ struct vm_area_struct *_install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, vm_flags_t vm_flags, const struct vm_special_mapping *spec) { return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, &special_mapping_vmops); } #ifdef CONFIG_SYSCTL #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif static const struct ctl_table mmap_table[] = { { .procname = "max_map_count", .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS { .procname = "mmap_rnd_bits", .data = &mmap_rnd_bits, .maxlen = sizeof(mmap_rnd_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_bits_min, .extra2 = (void *)&mmap_rnd_bits_max, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS { .procname = "mmap_rnd_compat_bits", .data = &mmap_rnd_compat_bits, .maxlen = sizeof(mmap_rnd_compat_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif }; #endif /* CONFIG_SYSCTL */ /* * initialise the percpu counter for VM, initialise VMA state. */ void __init mmap_init(void) { int ret; ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", mmap_table); #endif vma_state_init(); } /* * Initialise sysctl_user_reserve_kbytes. * * This is intended to prevent a user from starting a single memory hogging * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER * mode. * * The default value is min(3% of free memory, 128MB) * 128MB is enough to recover with sshd/login, bash, and top/kill. */ static int init_user_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K); return 0; } subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. * * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin * to log in and kill a memory hogging process. * * Systems with more than 256MB will reserve 8MB, enough to recover * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will * only reserve 3% of free pages by default. */ static int init_admin_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K); return 0; } subsys_initcall(init_admin_reserve); /* * Reinititalise user and admin reserves if memory is added or removed. * * The default user reserve max is 128MB, and the default max for the * admin reserve is 8MB. These are usually, but not always, enough to * enable recovery from a memory hogging process using login/sshd, a shell, * and tools like top. It may make sense to increase or even disable the * reserve depending on the existence of swap or variations in the recovery * tools. So, the admin may have changed them. * * If memory is added and the reserves have been eliminated or increased above * the default max, then we'll trust the admin. * * If memory is removed and there isn't enough free memory, then we * need to reset the reserves. * * Otherwise keep the reserve set by the admin. */ static int reserve_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { unsigned long tmp, free_kbytes; switch (action) { case MEM_ONLINE: /* Default max is 128MB. Leave alone if modified by operator. */ tmp = sysctl_user_reserve_kbytes; if (tmp > 0 && tmp < SZ_128K) init_user_reserve(); /* Default max is 8MB. Leave alone if modified by operator. */ tmp = sysctl_admin_reserve_kbytes; if (tmp > 0 && tmp < SZ_8K) init_admin_reserve(); break; case MEM_OFFLINE: free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); pr_info("vm.user_reserve_kbytes reset to %lu\n", sysctl_user_reserve_kbytes); } if (sysctl_admin_reserve_kbytes > free_kbytes) { init_admin_reserve(); pr_info("vm.admin_reserve_kbytes reset to %lu\n", sysctl_admin_reserve_kbytes); } break; default: break; } return NOTIFY_OK; } static int __meminit init_reserve_notifier(void) { if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; } subsys_initcall(init_reserve_notifier); /* * Obtain a read lock on mm->mmap_lock, if the specified address is below the * start of the VMA, the intent is to perform a write, and it is a * downward-growing stack, then attempt to expand the stack to contain it. * * This function is intended only for obtaining an argument page from an ELF * image, and is almost certainly NOT what you want to use for any other * purpose. * * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the * VMA referenced must not be linked in any user-visible tree, i.e. it must be a * new VMA being mapped. * * The function assumes that addr is either contained within the VMA or below * it, and makes no attempt to validate this value beyond that. * * Returns true if the read lock was obtained and a stack was perhaps expanded, * false if the stack expansion failed. * * On stack expansion the function temporarily acquires an mmap write lock * before downgrading it. */ bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *new_vma, unsigned long addr, bool write) { if (!write || addr >= new_vma->vm_start) { mmap_read_lock(mm); return true; } if (!(new_vma->vm_flags & VM_GROWSDOWN)) return false; mmap_write_lock(mm); if (expand_downwards(new_vma, addr)) { mmap_write_unlock(mm); return false; } mmap_write_downgrade(mm); return true; } __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp; int retval; unsigned long charge = 0; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, 0); if (mmap_write_lock_killable(oldmm)) return -EINTR; flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; /* Use __mt_dup() to efficiently build an identical maple tree. */ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); if (unlikely(retval)) goto out; mt_clear_in_rcu(vmi.mas.tree); for_each_vma(vmi, mpnt) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, mpnt->vm_end, GFP_KERNEL); if (retval) goto loop_out; vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; /* * Don't duplicate many vmas if we've been oom-killed (for * example) */ if (fatal_signal_pending(current)) { retval = -EINTR; goto loop_out; } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; if (tmp->vm_flags & VM_WIPEONFORK) { /* * VM_WIPEONFORK gets a clean slate in the child. * Don't prepare anon_vma until fault since we don't * copy page for current vma. */ tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); /* * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); /* * Link the vma into the MT. After using __mt_dup(), memory * allocation is not necessary here, so it cannot fail. */ vma_iter_bulk_store(&vmi, tmp); mm->map_count++; if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; get_file(file); i_mmap_lock_write(mapping); if (vma_is_shared_maywrite(tmp)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); if (retval) { mpnt = vma_next(&vmi); goto loop_out; } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); ksm_fork(mm, oldmm); khugepaged_fork(mm, oldmm); } else { /* * The entire maple tree has already been duplicated. If the * mmap duplication fails, mark the failure point with * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, * stop releasing VMAs that have not been duplicated after this * point. */ if (mpnt) { mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ set_bit(MMF_OOM_SKIP, &mm->flags); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ set_bit(MMF_UNSTABLE, &mm->flags); } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); if (!retval) dup_userfaultfd_complete(&uf); else dup_userfaultfd_fail(&uf); return retval; fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); goto loop_out; }
3 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 // SPDX-License-Identifier: GPL-2.0 /* * drivers/power/process.c - Functions for starting/stopping processes on * suspend transitions. * * Originally from swsusp. */ #include <linux/interrupt.h> #include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> #include <linux/workqueue.h> #include <linux/kmod.h> #include <trace/events/power.h> #include <linux/cpuset.h> /* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { const char *what = user_only ? "user space processes" : "remaining freezable tasks"; struct task_struct *g, *p; unsigned long end_time; unsigned int todo; bool wq_busy = false; ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; pr_info("Freezing %s\n", what); start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); if (!user_only) freeze_workqueues_begin(); while (true) { todo = 0; read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; todo++; } read_unlock(&tasklist_lock); if (!user_only) { wq_busy = freeze_workqueues_busy(); todo += wq_busy; } if (!todo || time_after(jiffies, end_time)) break; if (pm_wakeup_pending()) { wakeup = true; break; } /* * We need to retry, but first give the freezing tasks some * time to enter the refrigerator. Start with an initial * 1 ms sleep followed by exponential backoff until 8 ms. */ usleep_range(sleep_usecs / 2, sleep_usecs); if (sleep_usecs < 8 * USEC_PER_MSEC) sleep_usecs *= 2; } end = ktime_get_boottime(); elapsed = ktime_sub(end, start); elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_err("Freezing %s %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", what, wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (wq_busy) show_freezable_workqueues(); if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p != current && freezing(p) && !frozen(p)) sched_show_task(p); } read_unlock(&tasklist_lock); } } else { pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n", what, elapsed_msecs / 1000, elapsed_msecs % 1000); } return todo ? -EBUSY : 0; } /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls * freeze_processes must later call thaw_processes. * * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_processes(void) { int error; error = __usermodehelper_disable(UMH_FREEZING); if (error) return error; /* Make sure this task doesn't get frozen */ current->flags |= PF_SUSPEND_TASK; if (!pm_freezing) static_branch_inc(&freezer_active); pm_wakeup_clear(0); pm_freezing = true; error = try_to_freeze_tasks(true); if (!error) __usermodehelper_set_disable_depth(UMH_DISABLED); BUG_ON(in_atomic()); /* * Now that the whole userspace is frozen we need to disable * the OOM killer to disallow any further interference with * killable tasks. There is no guarantee oom victims will * ever reach a point they go away we have to wait with a timeout. */ if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) error = -EBUSY; if (error) thaw_processes(); return error; } /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * * On success, returns 0. On failure, -errno and only the kernel threads are * thawed, so as to give a chance to the caller to do additional cleanups * (if any) before thawing the userspace tasks. So, it is the responsibility * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { int error; pm_nosig_freezing = true; error = try_to_freeze_tasks(false); BUG_ON(in_atomic()); if (error) thaw_kernel_threads(); return error; } void thaw_processes(void) { struct task_struct *g, *p; struct task_struct *curr = current; trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) static_branch_dec(&freezer_active); pm_freezing = false; pm_nosig_freezing = false; oom_killer_enable(); pr_info("Restarting tasks: Starting\n"); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); curr->flags &= ~PF_SUSPEND_TASK; usermodehelper_enable(); schedule(); pr_info("Restarting tasks: Done\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } void thaw_kernel_threads(void) { struct task_struct *g, *p; pm_nosig_freezing = false; pr_info("Restarting kernel threads ...\n"); thaw_workqueues(); read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p->flags & PF_KTHREAD) __thaw_task(p); } read_unlock(&tasklist_lock); schedule(); pr_info("Done restarting kernel threads.\n"); }
932 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio PCI driver - common functionality for all device versions * * This module allows virtio devices to be used over a virtual PCI device. * This can be used with QEMU based VMMs like KVM or Xen. * * Copyright IBM Corp. 2007 * Copyright Red Hat, Inc. 2014 * * Authors: * Anthony Liguori <aliguori@us.ibm.com> * Rusty Russell <rusty@rustcorp.com.au> * Michael S. Tsirkin <mst@redhat.com> */ #include "virtio_pci_common.h" static bool force_legacy = false; #if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) module_param(force_legacy, bool, 0444); MODULE_PARM_DESC(force_legacy, "Force legacy mode for transitional virtio 1 devices"); #endif bool vp_is_avq(struct virtio_device *vdev, unsigned int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) return false; return index == vp_dev->admin_vq.vq_index; } /* wait for pending irq handlers */ void vp_synchronize_vectors(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i; if (vp_dev->intx_enabled) synchronize_irq(vp_dev->pci_dev->irq); for (i = 0; i < vp_dev->msix_vectors; ++i) synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i)); } /* the notify function used when creating a virt queue */ bool vp_notify(struct virtqueue *vq) { /* we write the queue's selector into the notification register to * signal the other end */ iowrite16(vq->index, (void __iomem *)vq->priv); return true; } /* Notify all slow path virtqueues on an interrupt. */ static void vp_vring_slow_path_interrupt(int irq, struct virtio_pci_device *vp_dev) { struct virtio_pci_vq_info *info; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->slow_virtqueues, node) vring_interrupt(irq, info->vq); spin_unlock_irqrestore(&vp_dev->lock, flags); } /* Handle a configuration change: Tell driver if it wants to know. */ static irqreturn_t vp_config_changed(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; virtio_config_changed(&vp_dev->vdev); vp_vring_slow_path_interrupt(irq, vp_dev); return IRQ_HANDLED; } /* Notify all virtqueues on an interrupt. */ static irqreturn_t vp_vring_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; struct virtio_pci_vq_info *info; irqreturn_t ret = IRQ_NONE; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->virtqueues, node) { if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) ret = IRQ_HANDLED; } spin_unlock_irqrestore(&vp_dev->lock, flags); return ret; } /* A small wrapper to also acknowledge the interrupt when it's handled. * I really need an EIO hook for the vring so I can ack the interrupt once we * know that we'll be handling the IRQ but before we invoke the callback since * the callback may notify the host which results in the host attempting to * raise an interrupt that we would then mask once we acknowledged the * interrupt. */ static irqreturn_t vp_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; u8 isr; /* reading the ISR has the effect of also clearing it so it's very * important to save off the value. */ isr = ioread8(vp_dev->isr); /* It's definitely not us if the ISR was not high */ if (!isr) return IRQ_NONE; /* Configuration change? Tell driver if it wants to know. */ if (isr & VIRTIO_PCI_ISR_CONFIG) vp_config_changed(irq, opaque); return vp_vring_interrupt(irq, opaque); } static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, bool per_vq_vectors, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned int flags = PCI_IRQ_MSIX; unsigned int i, v; int err = -ENOMEM; vp_dev->msix_vectors = nvectors; vp_dev->msix_names = kmalloc_array(nvectors, sizeof(*vp_dev->msix_names), GFP_KERNEL); if (!vp_dev->msix_names) goto error; vp_dev->msix_affinity_masks = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks), GFP_KERNEL); if (!vp_dev->msix_affinity_masks) goto error; for (i = 0; i < nvectors; ++i) if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], GFP_KERNEL)) goto error; if (!per_vq_vectors) desc = NULL; if (desc) { flags |= PCI_IRQ_AFFINITY; desc->pre_vectors++; /* virtio config vector */ } err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors, nvectors, flags, desc); if (err < 0) goto error; vp_dev->msix_enabled = 1; /* Set the vector used for configuration */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-config", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_config_changed, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; v = vp_dev->config_vector(vp_dev, v); /* Verify we had enough resources to assign the vector */ if (v == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto error; } if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-virtqueues", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; } return 0; error: return err; } static bool vp_is_slow_path_vector(u16 msix_vec) { return msix_vec == VP_MSIX_CONFIG_VECTOR; } static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx, u16 msix_vec, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); struct virtqueue *vq; unsigned long flags; /* fill out our structure that represents an active queue */ if (!info) return ERR_PTR(-ENOMEM); vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; info->vq = vq; if (callback) { spin_lock_irqsave(&vp_dev->lock, flags); if (!vp_is_slow_path_vector(msix_vec)) list_add(&info->node, &vp_dev->virtqueues); else list_add(&info->node, &vp_dev->slow_virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); } else { INIT_LIST_HEAD(&info->node); } *p_info = info; return vq; out_info: kfree(info); return vq; } static void vp_del_vq(struct virtqueue *vq, struct virtio_pci_vq_info *info) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); unsigned long flags; /* * If it fails during re-enable reset vq. This way we won't rejoin * info->node to the queue. Prevent unexpected irqs. */ if (!vq->reset) { spin_lock_irqsave(&vp_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vp_dev->lock, flags); } vp_dev->del_vq(info); kfree(info); } /* the config->del_vqs() implementation */ void vp_del_vqs(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq, *n; int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { info = vp_is_avq(vdev, vq->index) ? vp_dev->admin_vq.info : vp_dev->vqs[vq->index]; if (vp_dev->per_vq_vectors) { int v = info->msix_vector; if (v != VIRTIO_MSI_NO_VECTOR && !vp_is_slow_path_vector(v)) { int irq = pci_irq_vector(vp_dev->pci_dev, v); irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } vp_del_vq(vq, info); } vp_dev->per_vq_vectors = false; if (vp_dev->intx_enabled) { free_irq(vp_dev->pci_dev->irq, vp_dev); vp_dev->intx_enabled = 0; } for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev); if (vp_dev->msix_affinity_masks) { for (i = 0; i < vp_dev->msix_vectors; i++) free_cpumask_var(vp_dev->msix_affinity_masks[i]); } if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR); pci_free_irq_vectors(vp_dev->pci_dev); vp_dev->msix_enabled = 0; } vp_dev->msix_vectors = 0; vp_dev->msix_used_vectors = 0; kfree(vp_dev->msix_names); vp_dev->msix_names = NULL; kfree(vp_dev->msix_affinity_masks); vp_dev->msix_affinity_masks = NULL; kfree(vp_dev->vqs); vp_dev->vqs = NULL; } enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_EACH, VP_VQ_VECTOR_POLICY_SHARED_SLOW, VP_VQ_VECTOR_POLICY_SHARED, }; static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, bool slow_path, int *allocated_vectors, enum vp_vq_vector_policy vector_policy, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; u16 msix_vec; int err; if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH || (vector_policy == VP_VQ_VECTOR_POLICY_SHARED_SLOW && !slow_path)) msix_vec = (*allocated_vectors)++; else if (vector_policy != VP_VQ_VECTOR_POLICY_EACH && slow_path) msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec, p_info); if (IS_ERR(vq)) return vq; if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || msix_vec == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(msix_vec)) return vq; /* allocate per-vq irq if available and necessary */ snprintf(vp_dev->msix_names[msix_vec], sizeof(*vp_dev->msix_names), "%s-%s", dev_name(&vp_dev->vdev.dev), name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vq); if (err) { vp_del_vq(vq, *p_info); return ERR_PTR(err); } return vq; } static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], enum vp_vq_vector_policy vector_policy, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; struct virtqueue *vq; bool per_vq_vectors; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto error_find; } per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. */ nvectors = 1; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (vqi->name && vqi->callback) ++nvectors; } if (avq_num && vector_policy == VP_VQ_VECTOR_POLICY_EACH) ++nvectors; } else { /* Second best: one for change, shared for all vqs. */ nvectors = 2; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, desc); if (err) goto error_find; vp_dev->per_vq_vectors = per_vq_vectors; allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, false, &allocated_vectors, vector_policy, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_find_one_vq_msix(vdev, avq->vq_index, vp_modern_avq_done, avq->name, false, true, &allocated_vectors, vector_policy, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto error_find; } return 0; error_find: vp_del_vqs(vdev); return err; } static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[]) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; int i, err, queue_idx = 0; struct virtqueue *vq; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto out_del_vqs; } err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vp_dev); if (err) goto out_del_vqs; vp_dev->intx_enabled = 1; vp_dev->per_vq_vectors = false; for (i = 0; i < nvqs; ++i) { struct virtqueue_info *vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, VIRTIO_MSI_NO_VECTOR, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto out_del_vqs; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_setup_vq(vdev, queue_idx++, vp_modern_avq_done, avq->name, false, VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto out_del_vqs; } return 0; out_del_vqs: vp_del_vqs(vdev); return err; } /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 0; /* Fallback: MSI-X with one shared vector for config and * slow path queues, one vector per queue for the rest. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED_SLOW, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED, desc); if (!err) return 0; /* Is there an interrupt? If not give up. */ if (!(to_vp_device(vdev)->pci_dev->irq)) return err; /* Finally fall back to regular interrupts. */ return vp_find_vqs_intx(vdev, nvqs, vqs, vqs_info); } const char *vp_bus_name(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); return pci_name(vp_dev->pci_dev); } /* Setup the affinity for a virtqueue: * - force the affinity for per vq vector * - OR over all affinities for shared MSI * - ignore the affinity request if we're using INTX */ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; struct cpumask *mask; unsigned int irq; if (!vq->callback) return -EINVAL; if (vp_dev->msix_enabled) { mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); irq_set_affinity_and_hint(irq, mask); } } return 0; } const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!vp_dev->per_vq_vectors || vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(vp_dev->vqs[index]->msix_vector)) return NULL; return pci_irq_get_affinity(vp_dev->pci_dev, vp_dev->vqs[index]->msix_vector); } #ifdef CONFIG_PM_SLEEP static int virtio_pci_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = virtio_device_freeze(&vp_dev->vdev); if (!ret) pci_disable_device(pci_dev); return ret; } static int virtio_pci_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = pci_enable_device(pci_dev); if (ret) return ret; pci_set_master(pci_dev); return virtio_device_restore(&vp_dev->vdev); } static bool vp_supports_pm_no_reset(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); u16 pmcsr; if (!pci_dev->pm_cap) return false; pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { dev_err(dev, "Unable to query pmcsr"); return false; } return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET; } static int virtio_pci_suspend(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev); } static int virtio_pci_resume(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev); } static const struct dev_pm_ops virtio_pci_pm_ops = { .suspend = virtio_pci_suspend, .resume = virtio_pci_resume, .freeze = virtio_pci_freeze, .thaw = virtio_pci_restore, .poweroff = virtio_pci_freeze, .restore = virtio_pci_restore, }; #endif /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ static const struct pci_device_id virtio_pci_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, { 0 } }; MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); static void virtio_pci_release_dev(struct device *_d) { struct virtio_device *vdev = dev_to_virtio(_d); struct virtio_pci_device *vp_dev = to_vp_device(vdev); /* As struct device is a kobject, it's not safe to * free the memory (including the reference counter itself) * until it's release callback. */ kfree(vp_dev); } static int virtio_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { struct virtio_pci_device *vp_dev, *reg_dev = NULL; int rc; /* allocate our structure and fill it out */ vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); if (!vp_dev) return -ENOMEM; pci_set_drvdata(pci_dev, vp_dev); vp_dev->vdev.dev.parent = &pci_dev->dev; vp_dev->vdev.dev.release = virtio_pci_release_dev; vp_dev->pci_dev = pci_dev; INIT_LIST_HEAD(&vp_dev->virtqueues); INIT_LIST_HEAD(&vp_dev->slow_virtqueues); spin_lock_init(&vp_dev->lock); /* enable the device */ rc = pci_enable_device(pci_dev); if (rc) goto err_enable_device; if (force_legacy) { rc = virtio_pci_legacy_probe(vp_dev); /* Also try modern mode if we can't map BAR0 (no IO space). */ if (rc == -ENODEV || rc == -ENOMEM) rc = virtio_pci_modern_probe(vp_dev); if (rc) goto err_probe; } else { rc = virtio_pci_modern_probe(vp_dev); if (rc == -ENODEV) rc = virtio_pci_legacy_probe(vp_dev); if (rc) goto err_probe; } pci_set_master(pci_dev); rc = register_virtio_device(&vp_dev->vdev); reg_dev = vp_dev; if (rc) goto err_register; return 0; err_register: if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); err_probe: pci_disable_device(pci_dev); err_enable_device: if (reg_dev) put_device(&vp_dev->vdev.dev); else kfree(vp_dev); return rc; } static void virtio_pci_remove(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev); /* * Device is marked broken on surprise removal so that virtio upper * layers can abort any ongoing operation. */ if (!pci_device_is_present(pci_dev)) virtio_break_device(&vp_dev->vdev); pci_disable_sriov(pci_dev); unregister_virtio_device(&vp_dev->vdev); if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); pci_disable_device(pci_dev); put_device(dev); } static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct virtio_device *vdev = &vp_dev->vdev; int ret; if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) return -EBUSY; if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV)) return -EINVAL; if (pci_vfs_assigned(pci_dev)) return -EPERM; if (num_vfs == 0) { pci_disable_sriov(pci_dev); return 0; } ret = pci_enable_sriov(pci_dev, num_vfs); if (ret < 0) return ret; return num_vfs; } static void virtio_pci_reset_prepare(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret = 0; ret = virtio_device_reset_prepare(&vp_dev->vdev); if (ret) { if (ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset prepare failure: %d", ret); return; } if (pci_is_enabled(pci_dev)) pci_disable_device(pci_dev); } static void virtio_pci_reset_done(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; if (pci_is_enabled(pci_dev)) return; ret = pci_enable_device(pci_dev); if (!ret) { pci_set_master(pci_dev); ret = virtio_device_reset_done(&vp_dev->vdev); } if (ret && ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset done failure: %d", ret); } static const struct pci_error_handlers virtio_pci_err_handler = { .reset_prepare = virtio_pci_reset_prepare, .reset_done = virtio_pci_reset_done, }; static struct pci_driver virtio_pci_driver = { .name = "virtio-pci", .id_table = virtio_pci_id_table, .probe = virtio_pci_probe, .remove = virtio_pci_remove, #ifdef CONFIG_PM_SLEEP .driver.pm = &virtio_pci_pm_ops, #endif .sriov_configure = virtio_pci_sriov_configure, .err_handler = &virtio_pci_err_handler, }; struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev) { struct virtio_pci_device *pf_vp_dev; pf_vp_dev = pci_iov_get_pf_drvdata(pdev, &virtio_pci_driver); if (IS_ERR(pf_vp_dev)) return NULL; return &pf_vp_dev->vdev; } module_pci_driver(virtio_pci_driver); MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); MODULE_DESCRIPTION("virtio-pci"); MODULE_LICENSE("GPL"); MODULE_VERSION("1");
2 2 3 3 3 3 1 1 1 5 5 3 4 3 3 4 4 3 4 2 6 6 1 5 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 // SPDX-License-Identifier: GPL-2.0-only /* * Binary Increase Congestion control for TCP * Home page: * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC * This is from the implementation of BICTCP in * Lison-Xu, Kahaled Harfoush, and Injong Rhee. * "Binary Increase Congestion Control for Fast, Long Distance * Networks" in InfoComm 2004 * Available from: * http://netsrv.csc.ncsu.edu/export/bitcp.pdf * * Unless BIC is enabled and congestion window is large * this behaves the same as the original Reno. */ #include <linux/mm.h> #include <linux/module.h> #include <net/tcp.h> #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta */ #define BICTCP_B 4 /* * In binary search, * go to point (max+min)/N */ static int fast_convergence = 1; static int max_increment = 16; static int low_window = 14; static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ static int initial_ssthresh; static int smooth_part = 20; module_param(fast_convergence, int, 0644); MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); module_param(max_increment, int, 0644); MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); module_param(low_window, int, 0644); MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(smooth_part, int, 0644); MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ u32 last_max_cwnd; /* last maximum snd_cwnd */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 epoch_start; /* beginning of an epoch */ #define ACK_RATIO_SHIFT 4 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ }; static inline void bictcp_reset(struct bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; ca->epoch_start = 0; ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } static void bictcp_init(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); if (initial_ssthresh) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } /* * Compute congestion window to use. */ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { if (ca->last_cwnd == cwnd && (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; ca->last_cwnd = cwnd; ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) /* record the beginning of an epoch */ ca->epoch_start = tcp_jiffies32; /* start off normal */ if (cwnd <= low_window) { ca->cnt = cwnd; return; } /* binary increase */ if (cwnd < ca->last_max_cwnd) { __u32 dist = (ca->last_max_cwnd - cwnd) / BICTCP_B; if (dist > max_increment) /* linear increase */ ca->cnt = cwnd / max_increment; else if (dist <= 1U) /* binary search increase */ ca->cnt = (cwnd * smooth_part) / BICTCP_B; else /* binary search increase */ ca->cnt = cwnd / dist; } else { /* slow start AMD linear increase */ if (cwnd < ca->last_max_cwnd + BICTCP_B) /* slow start */ ca->cnt = (cwnd * smooth_part) / BICTCP_B; else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) /* slow start */ ca->cnt = (cwnd * (BICTCP_B-1)) / (cwnd - ca->last_max_cwnd); else /* linear increase */ ca->cnt = cwnd / max_increment; } /* if in slow start or link utilization is very low */ if (ca->last_max_cwnd == 0) { if (ca->cnt > 20) /* increase cwnd 5% per RTT */ ca->cnt = 20; } ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; if (ca->cnt == 0) /* cannot be zero */ ca->cnt = 1; } static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } bictcp_update(ca, tcp_snd_cwnd(tp)); tcp_cong_avoid_ai(tp, ca->cnt, acked); } /* * behave like Reno until low_window is reached, * then increase congestion window slowly */ static u32 bictcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ /* Wmax and fast convergence */ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence) ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta)) / (2 * BICTCP_BETA_SCALE); else ca->last_max_cwnd = tcp_snd_cwnd(tp); if (tcp_snd_cwnd(tp) <= low_window) return max(tcp_snd_cwnd(tp) >> 1U, 2U); else return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U); } static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) bictcp_reset(inet_csk_ca(sk)); } /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); ca->delayed_ack += sample->pkts_acked - (ca->delayed_ack >> ACK_RATIO_SHIFT); } } static struct tcp_congestion_ops bictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = tcp_reno_undo_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "bic", }; static int __init bictcp_register(void) { BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&bictcp); } static void __exit bictcp_unregister(void) { tcp_unregister_congestion_control(&bictcp); } module_init(bictcp_register); module_exit(bictcp_unregister); MODULE_AUTHOR("Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("BIC TCP");
11 11 10 10 9 9 9 4 4 3 1 1 1 12 12 10 10 10 1 2 2 2 6 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> #include <net/inet_sock.h> #include <net/raw.h> #include <net/rawv6.h> #ifdef pr_fmt # undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt static struct raw_hashinfo * raw_get_hashinfo(const struct inet_diag_req_v2 *r) { if (r->sdiag_family == AF_INET) { return &raw_v4_hashinfo; #if IS_ENABLED(CONFIG_IPV6) } else if (r->sdiag_family == AF_INET6) { return &raw_v6_hashinfo; #endif } else { return ERR_PTR(-EINVAL); } } /* * Due to requirement of not breaking user API we can't simply * rename @pad field in inet_diag_req_v2 structure, instead * use helper to figure it out. */ static bool raw_lookup(struct net *net, const struct sock *sk, const struct inet_diag_req_v2 *req) { struct inet_diag_req_raw *r = (void *)req; if (r->sdiag_family == AF_INET) return raw_v4_match(net, sk, r->sdiag_raw_protocol, r->id.idiag_dst[0], r->id.idiag_src[0], r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else return raw_v6_match(net, sk, r->sdiag_raw_protocol, (const struct in6_addr *)r->id.idiag_src, (const struct in6_addr *)r->id.idiag_dst, r->id.idiag_if, 0); #endif return false; } static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct hlist_head *hlist; struct sock *sk; int slot; if (IS_ERR(hashinfo)) return ERR_CAST(hashinfo); rcu_read_lock(); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill * diag message to be reported, so * caller should call sock_put then. */ if (refcount_inc_not_zero(&sk->sk_refcnt)) goto out_unlock; } } } sk = ERR_PTR(-ENOENT); out_unlock: rcu_read_unlock(); return sk; } static int raw_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct sk_buff *in_skb = cb->skb; struct sk_buff *rep; struct sock *sk; struct net *net; int err; net = sock_net(in_skb->sk); sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) { sock_put(sk); return -ENOMEM; } err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); sock_put(sk); if (err < 0) { kfree_skb(rep); return err; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); return err; } static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct hlist_head *hlist; struct sock *sk = NULL; struct nlattr *bc; if (IS_ERR(hashinfo)) return; cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; rcu_read_lock(); for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { num = 0; hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) goto out_unlock; next: num++; } } out_unlock: rcu_read_unlock(); cb->args[0] = slot; cb->args[1] = num; } static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int raw_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *r) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } #endif static const struct inet_diag_handler raw_diag_handler = { .owner = THIS_MODULE, .dump = raw_diag_dump, .dump_one = raw_diag_dump_one, .idiag_get_info = raw_diag_get_info, .idiag_type = IPPROTO_RAW, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = raw_diag_destroy, #endif }; static void __always_unused __check_inet_diag_req_raw(void) { /* * Make sure the two structures are identical, * except the @pad field. */ #define __offset_mismatch(m1, m2) \ (offsetof(struct inet_diag_req_v2, m1) != \ offsetof(struct inet_diag_req_raw, m2)) BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) != sizeof(struct inet_diag_req_raw)); BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family)); BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext)); BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states)); BUILD_BUG_ON(__offset_mismatch(id, id)); #undef __offset_mismatch } static int __init raw_diag_init(void) { return inet_diag_register(&raw_diag_handler); } static void __exit raw_diag_exit(void) { inet_diag_unregister(&raw_diag_handler); } module_init(raw_diag_init); module_exit(raw_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
114 6 113 12 58 112 6 110 27 20 6 2 1 4 27 27 33 39 39 20 18 33 22 13 3 2 4 9 4 4 20 20 13 2 13 6 9 9 1 5 372 368 360 45 360 360 19 1 14 3 12 5 13 4 13 4 8 9 14 3 10 10 10 8 8 596 659 1 1 1 655 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: IP/IP protocol decoder modified to support * virtual tunnel interface * * Authors: * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012 */ /* This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c For comments look at net/ipv4/ip_gre.c --ANK */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <linux/icmpv6.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> static struct rtnl_link_ops vti_link_ops __read_mostly; static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, bool update_skb_dev) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, vti_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; if (update_skb_dev) skb->dev = tunnel->dev; return xfrm_input(skb, nexthdr, spi, encap_type); } return -EINVAL; drop: kfree_skb(skb); return 0; } static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return vti_input(skb, nexthdr, spi, encap_type, false); } static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev); } static int vti_rcv_proto(struct sk_buff *skb) { return vti_rcv(skb, 0, false); } static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; struct net_device *dev; struct xfrm_state *x; const struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; if (!tunnel) return 1; dev = tunnel->dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } x = xfrm_input_state(skb); inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } family = inner_mode->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); skb->mark = orig_mark; if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return 0; } static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) { xfrm_address_t *daddr = (xfrm_address_t *)&dst; xfrm_address_t *saddr = (xfrm_address_t *)&src; /* if there is no transform then this tunnel is not functional. * Or if the xfrm is not mode tunnel. */ if (!x || x->props.mode != XFRM_MODE_TUNNEL || x->props.family != AF_INET) return false; if (!dst) return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) return false; return true; } static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm_kern *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int pkt_len = skb->len; int err; int mtu; if (!dst) { switch (skb->protocol) { case htons(ETH_P_IP): { struct rtable *rt; fl->u.ip4.flowi4_oif = dev->ifindex; fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } dst = &rt->dst; skb_dst_set(skb, dst); break; } #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): fl->u.ip6.flowi6_oif = dev->ifindex; fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); if (dst->error) { dst_release(dst); dst = NULL; DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } skb_dst_set(skb, dst); break; #endif default: DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } } dst_hold(dst); dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } if (dst->flags & DST_XFRM_QUEUE) goto xmit; if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { DEV_STATS_INC(dev, tx_carrier_errors); dst_release(dst); goto tx_error_icmp; } tdev = dst_dev(dst); if (tdev == dev) { dst_release(dst); DEV_STATS_INC(dev, collisions); goto tx_error; } mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } else { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } dst_release(dst); goto tx_error; } xmit: skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst_dev(skb); err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } /* This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct flowi fl; if (!pskb_inet_may_pull(skb)) goto tx_err; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); break; case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); break; default: goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); return vti_xmit(skb, dev, &fl); tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int vti4_err(struct sk_buff *skb, u32 info) { __be32 spi; __u32 mark; struct xfrm_state *x; struct ip_tunnel *tunnel; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct ip_comp_hdr *ipch; struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; int protocol = iph->protocol; struct ip_tunnel_net *itn = net_generic(net, vti_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!tunnel) return -1; mark = be32_to_cpu(tunnel->parms.o_key); switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; int err = 0; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP || p->iph.ihl != 5) return -EINVAL; } if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || !ip_tunnel_flags_is_be16_compat(p->o_flags)) return -EOVERFLOW; if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY)) p->i_key = 0; if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY)) p->o_key = 0; __set_bit(IP_TUNNEL_VTI_BIT, flags); ip_tunnel_flags_copy(p->i_flags, flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { ip_tunnel_flags_from_be16(flags, GRE_KEY); ip_tunnel_flags_or(p->i_flags, p->i_flags, flags); ip_tunnel_flags_or(p->o_flags, p->o_flags, flags); } return 0; } static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = vti_tunnel_ctl, }; static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } static int vti_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; dev->addr_len = 4; dev->lltx = true; netif_keep_dst(dev); return ip_tunnel_init(dev); } static void __net_init vti_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; iph->version = 4; iph->protocol = IPPROTO_IPIP; iph->ihl = 5; } static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int vti_rcv_tunnel(struct sk_buff *skb) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false); } static struct xfrm_tunnel vti_ipip_handler __read_mostly = { .handler = vti_rcv_tunnel, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; #if IS_ENABLED(CONFIG_IPV6) static struct xfrm_tunnel vti_ipip6_handler __read_mostly = { .handler = vti_rcv_tunnel, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; #endif #endif static int __net_init vti_init_net(struct net *net) { int err; struct ip_tunnel_net *itn; err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0"); if (err) return err; itn = net_generic(net, vti_net_id); if (itn->fb_tunnel_dev) vti_fb_tunnel_init(itn->fb_tunnel_dev); return 0; } static void __net_exit vti_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, .exit_rtnl = vti_exit_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void vti_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_IPIP; if (!data) return; __set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags); if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); if (data[IFLA_VTI_IKEY]) parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); if (data[IFLA_VTI_OKEY]) parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); if (data[IFLA_VTI_LOCAL]) parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]); if (data[IFLA_VTI_FWMARK]) *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct ip_tunnel_parm_kern parms; struct nlattr **tb = params->tb; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &parms, fwmark); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; vti_netlink_parms(data, &p, &fwmark); return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t vti_get_size(const struct net_device *dev) { return /* IFLA_VTI_LINK */ nla_total_size(4) + /* IFLA_VTI_IKEY */ nla_total_size(4) + /* IFLA_VTI_OKEY */ nla_total_size(4) + /* IFLA_VTI_LOCAL */ nla_total_size(4) + /* IFLA_VTI_REMOTE */ nla_total_size(4) + /* IFLA_VTI_FWMARK */ nla_total_size(4) + 0; } static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern *p = &t->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) || nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) || nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr) || nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr) || nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark)) return -EMSGSIZE; return 0; } static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_LINK] = { .type = NLA_U32 }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, [IFLA_VTI_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti_link_ops __read_mostly = { .kind = "vti", .maxtype = IFLA_VTI_MAX, .policy = vti_policy, .priv_size = sizeof(struct ip_tunnel), .setup = vti_tunnel_setup, .validate = vti_tunnel_validate, .newlink = vti_newlink, .changelink = vti_changelink, .dellink = ip_tunnel_dellink, .get_size = vti_get_size, .fill_info = vti_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static int __init vti_init(void) { const char *msg; int err; pr_info("IPv4 over IPsec tunneling driver\n"); msg = "tunnel device"; err = register_pernet_device(&vti_net_ops); if (err < 0) goto pernet_dev_failed; msg = "tunnel protocols"; err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) msg = "ipip tunnel"; err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; #if IS_ENABLED(CONFIG_IPV6) err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif #endif msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) goto rtnl_link_failed; return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) #if IS_ENABLED(CONFIG_IPV6) xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); xfrm_tunnel_ipip6_failed: #endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: unregister_pernet_device(&vti_net_ops); pernet_dev_failed: pr_err("vti init: failed to register %s\n", msg); return err; } static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) #if IS_ENABLED(CONFIG_IPV6) xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); #endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); } module_init(vti_init); module_exit(vti_fini); MODULE_DESCRIPTION("Virtual (secure) IP tunneling library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("vti"); MODULE_ALIAS_NETDEV("ip_vti0");
125 125 125 125 125 125 125 125 125 1 125 162 1 1 136 24 126 126 3 3 263 1 43 143 16 26 136 61 236 12 5 12 53 1 51 52 53 53 53 53 11 117 5 6 3 26 78 5 43 55 91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 // SPDX-License-Identifier: GPL-2.0-or-later /* Asymmetric public-key cryptography key type * * See Documentation/crypto/asymmetric-keys.rst * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <keys/asymmetric-subtype.h> #include <keys/asymmetric-parser.h> #include <crypto/public_key.h> #include <linux/seq_file.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/ctype.h> #include <keys/system_keyring.h> #include <keys/user-type.h> #include "asymmetric_keys.h" static LIST_HEAD(asymmetric_key_parsers); static DECLARE_RWSEM(asymmetric_key_parsers_sem); /** * find_asymmetric_key - Find a key by ID. * @keyring: The keys to search. * @id_0: The first ID to look for or NULL. * @id_1: The second ID to look for or NULL, matched together with @id_0 * against @keyring keys' id[0] and id[1]. * @id_2: The fallback ID to match against @keyring keys' id[2] if both of the * other IDs are NULL. * @partial: Use partial match for @id_0 and @id_1 if true, exact if false. * * Find a key in the given keyring by identifier. The preferred identifier is * the id_0 and the fallback identifier is the id_1. If both are given, the * former is matched (exactly or partially) against either of the sought key's * identifiers and the latter must match the found key's second identifier * exactly. If both are missing, id_2 must match the sought key's third * identifier exactly. */ struct key *find_asymmetric_key(struct key *keyring, const struct asymmetric_key_id *id_0, const struct asymmetric_key_id *id_1, const struct asymmetric_key_id *id_2, bool partial) { struct key *key; key_ref_t ref; const char *lookup; char *req, *p; int len; if (id_0) { lookup = id_0->data; len = id_0->len; } else if (id_1) { lookup = id_1->data; len = id_1->len; } else if (id_2) { lookup = id_2->data; len = id_2->len; } else { WARN_ON(1); return ERR_PTR(-EINVAL); } /* Construct an identifier "id:<keyid>". */ p = req = kmalloc(2 + 1 + len * 2 + 1, GFP_KERNEL); if (!req) return ERR_PTR(-ENOMEM); if (!id_0 && !id_1) { *p++ = 'd'; *p++ = 'n'; } else if (partial) { *p++ = 'i'; *p++ = 'd'; } else { *p++ = 'e'; *p++ = 'x'; } *p++ = ':'; p = bin2hex(p, lookup, len); *p = 0; pr_debug("Look up: \"%s\"\n", req); ref = keyring_search(make_key_ref(keyring, 1), &key_type_asymmetric, req, true); if (IS_ERR(ref)) pr_debug("Request for key '%s' err %ld\n", req, PTR_ERR(ref)); kfree(req); if (IS_ERR(ref)) { switch (PTR_ERR(ref)) { /* Hide some search errors */ case -EACCES: case -ENOTDIR: case -EAGAIN: return ERR_PTR(-ENOKEY); default: return ERR_CAST(ref); } } key = key_ref_to_ptr(ref); if (id_0 && id_1) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); if (!kids->id[1]) { pr_debug("First ID matches, but second is missing\n"); goto reject; } if (!asymmetric_key_id_same(id_1, kids->id[1])) { pr_debug("First ID matches, but second does not\n"); goto reject; } } pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key)); return key; reject: key_put(key); return ERR_PTR(-EKEYREJECTED); } EXPORT_SYMBOL_GPL(find_asymmetric_key); /** * asymmetric_key_generate_id: Construct an asymmetric key ID * @val_1: First binary blob * @len_1: Length of first binary blob * @val_2: Second binary blob * @len_2: Length of second binary blob * * Construct an asymmetric key ID from a pair of binary blobs. */ struct asymmetric_key_id *asymmetric_key_generate_id(const void *val_1, size_t len_1, const void *val_2, size_t len_2) { struct asymmetric_key_id *kid; kid = kmalloc(sizeof(struct asymmetric_key_id) + len_1 + len_2, GFP_KERNEL); if (!kid) return ERR_PTR(-ENOMEM); kid->len = len_1 + len_2; memcpy(kid->data, val_1, len_1); memcpy(kid->data + len_1, val_2, len_2); return kid; } EXPORT_SYMBOL_GPL(asymmetric_key_generate_id); /** * asymmetric_key_id_same - Return true if two asymmetric keys IDs are the same. * @kid1: The key ID to compare * @kid2: The key ID to compare */ bool asymmetric_key_id_same(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2) { if (!kid1 || !kid2) return false; if (kid1->len != kid2->len) return false; return memcmp(kid1->data, kid2->data, kid1->len) == 0; } EXPORT_SYMBOL_GPL(asymmetric_key_id_same); /** * asymmetric_key_id_partial - Return true if two asymmetric keys IDs * partially match * @kid1: The key ID to compare * @kid2: The key ID to compare */ bool asymmetric_key_id_partial(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2) { if (!kid1 || !kid2) return false; if (kid1->len < kid2->len) return false; return memcmp(kid1->data + (kid1->len - kid2->len), kid2->data, kid2->len) == 0; } EXPORT_SYMBOL_GPL(asymmetric_key_id_partial); /** * asymmetric_match_key_ids - Search asymmetric key IDs 1 & 2 * @kids: The pair of key IDs to check * @match_id: The key ID we're looking for * @match: The match function to use */ static bool asymmetric_match_key_ids( const struct asymmetric_key_ids *kids, const struct asymmetric_key_id *match_id, bool (*match)(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2)) { int i; if (!kids || !match_id) return false; for (i = 0; i < 2; i++) if (match(kids->id[i], match_id)) return true; return false; } /* helper function can be called directly with pre-allocated memory */ inline int __asymmetric_key_hex_to_key_id(const char *id, struct asymmetric_key_id *match_id, size_t hexlen) { match_id->len = hexlen; return hex2bin(match_id->data, id, hexlen); } /** * asymmetric_key_hex_to_key_id - Convert a hex string into a key ID. * @id: The ID as a hex string. */ struct asymmetric_key_id *asymmetric_key_hex_to_key_id(const char *id) { struct asymmetric_key_id *match_id; size_t asciihexlen; int ret; if (!*id) return ERR_PTR(-EINVAL); asciihexlen = strlen(id); if (asciihexlen & 1) return ERR_PTR(-EINVAL); match_id = kmalloc(sizeof(struct asymmetric_key_id) + asciihexlen / 2, GFP_KERNEL); if (!match_id) return ERR_PTR(-ENOMEM); ret = __asymmetric_key_hex_to_key_id(id, match_id, asciihexlen / 2); if (ret < 0) { kfree(match_id); return ERR_PTR(-EINVAL); } return match_id; } /* * Match asymmetric keys by an exact match on one of the first two IDs. */ static bool asymmetric_key_cmp(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return asymmetric_match_key_ids(kids, match_id, asymmetric_key_id_same); } /* * Match asymmetric keys by a partial match on one of the first two IDs. */ static bool asymmetric_key_cmp_partial(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return asymmetric_match_key_ids(kids, match_id, asymmetric_key_id_partial); } /* * Match asymmetric keys by an exact match on the third IDs. */ static bool asymmetric_key_cmp_name(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return kids && asymmetric_key_id_same(kids->id[2], match_id); } /* * Preparse the match criterion. If we don't set lookup_type and cmp, * the default will be an exact match on the key description. * * There are some specifiers for matching key IDs rather than by the key * description: * * "id:<id>" - find a key by partial match on one of the first two IDs * "ex:<id>" - find a key by exact match on one of the first two IDs * "dn:<id>" - find a key by exact match on the third ID * * These have to be searched by iteration rather than by direct lookup because * the key is hashed according to its description. */ static int asymmetric_key_match_preparse(struct key_match_data *match_data) { struct asymmetric_key_id *match_id; const char *spec = match_data->raw_data; const char *id; bool (*cmp)(const struct key *, const struct key_match_data *) = asymmetric_key_cmp; if (!spec || !*spec) return -EINVAL; if (spec[0] == 'i' && spec[1] == 'd' && spec[2] == ':') { id = spec + 3; cmp = asymmetric_key_cmp_partial; } else if (spec[0] == 'e' && spec[1] == 'x' && spec[2] == ':') { id = spec + 3; } else if (spec[0] == 'd' && spec[1] == 'n' && spec[2] == ':') { id = spec + 3; cmp = asymmetric_key_cmp_name; } else { goto default_match; } match_id = asymmetric_key_hex_to_key_id(id); if (IS_ERR(match_id)) return PTR_ERR(match_id); match_data->preparsed = match_id; match_data->cmp = cmp; match_data->lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE; return 0; default_match: return 0; } /* * Free the preparsed the match criterion. */ static void asymmetric_key_match_free(struct key_match_data *match_data) { kfree(match_data->preparsed); } /* * Describe the asymmetric key */ static void asymmetric_key_describe(const struct key *key, struct seq_file *m) { const struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key); const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *kid; const unsigned char *p; int n; seq_puts(m, key->description); if (subtype) { seq_puts(m, ": "); subtype->describe(key, m); if (kids && kids->id[1]) { kid = kids->id[1]; seq_putc(m, ' '); n = kid->len; p = kid->data; if (n > 4) { p += n - 4; n = 4; } seq_printf(m, "%*phN", n, p); } seq_puts(m, " ["); /* put something here to indicate the key's capabilities */ seq_putc(m, ']'); } } /* * Preparse a asymmetric payload to get format the contents appropriately for the * internal payload to cut down on the number of scans of the data performed. * * We also generate a proposed description from the contents of the key that * can be used to name the key if the user doesn't want to provide one. */ static int asymmetric_key_preparse(struct key_preparsed_payload *prep) { struct asymmetric_key_parser *parser; int ret; pr_devel("==>%s()\n", __func__); if (prep->datalen == 0) return -EINVAL; down_read(&asymmetric_key_parsers_sem); ret = -EBADMSG; list_for_each_entry(parser, &asymmetric_key_parsers, link) { pr_debug("Trying parser '%s'\n", parser->name); ret = parser->parse(prep); if (ret != -EBADMSG) { pr_debug("Parser recognised the format (ret %d)\n", ret); break; } } up_read(&asymmetric_key_parsers_sem); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /* * Clean up the key ID list */ static void asymmetric_key_free_kids(struct asymmetric_key_ids *kids) { int i; if (kids) { for (i = 0; i < ARRAY_SIZE(kids->id); i++) kfree(kids->id[i]); kfree(kids); } } /* * Clean up the preparse data */ static void asymmetric_key_free_preparse(struct key_preparsed_payload *prep) { struct asymmetric_key_subtype *subtype = prep->payload.data[asym_subtype]; struct asymmetric_key_ids *kids = prep->payload.data[asym_key_ids]; pr_devel("==>%s()\n", __func__); if (subtype) { subtype->destroy(prep->payload.data[asym_crypto], prep->payload.data[asym_auth]); module_put(subtype->owner); } asymmetric_key_free_kids(kids); kfree(prep->description); } /* * dispose of the data dangling from the corpse of a asymmetric key */ static void asymmetric_key_destroy(struct key *key) { struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key); struct asymmetric_key_ids *kids = key->payload.data[asym_key_ids]; void *data = key->payload.data[asym_crypto]; void *auth = key->payload.data[asym_auth]; key->payload.data[asym_crypto] = NULL; key->payload.data[asym_subtype] = NULL; key->payload.data[asym_key_ids] = NULL; key->payload.data[asym_auth] = NULL; if (subtype) { subtype->destroy(data, auth); module_put(subtype->owner); } asymmetric_key_free_kids(kids); } static struct key_restriction *asymmetric_restriction_alloc( key_restrict_link_func_t check, struct key *key) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; keyres->key = key; keyres->keytype = &key_type_asymmetric; return keyres; } /* * look up keyring restrict functions for asymmetric keys */ static struct key_restriction *asymmetric_lookup_restriction( const char *restriction) { char *restrict_method; char *parse_buf; char *next; struct key_restriction *ret = ERR_PTR(-EINVAL); if (strcmp("builtin_trusted", restriction) == 0) return asymmetric_restriction_alloc( restrict_link_by_builtin_trusted, NULL); if (strcmp("builtin_and_secondary_trusted", restriction) == 0) return asymmetric_restriction_alloc( restrict_link_by_builtin_and_secondary_trusted, NULL); parse_buf = kstrndup(restriction, PAGE_SIZE, GFP_KERNEL); if (!parse_buf) return ERR_PTR(-ENOMEM); next = parse_buf; restrict_method = strsep(&next, ":"); if ((strcmp(restrict_method, "key_or_keyring") == 0) && next) { char *key_text; key_serial_t serial; struct key *key; key_restrict_link_func_t link_fn = restrict_link_by_key_or_keyring; bool allow_null_key = false; key_text = strsep(&next, ":"); if (next) { if (strcmp(next, "chain") != 0) goto out; link_fn = restrict_link_by_key_or_keyring_chain; allow_null_key = true; } if (kstrtos32(key_text, 0, &serial) < 0) goto out; if ((serial == 0) && allow_null_key) { key = NULL; } else { key = key_lookup(serial); if (IS_ERR(key)) { ret = ERR_CAST(key); goto out; } } ret = asymmetric_restriction_alloc(link_fn, key); if (IS_ERR(ret)) key_put(key); } out: kfree(parse_buf); return ret; } int asymmetric_key_eds_op(struct kernel_pkey_params *params, const void *in, void *out) { const struct asymmetric_key_subtype *subtype; struct key *key = params->key; int ret; pr_devel("==>%s()\n", __func__); if (key->type != &key_type_asymmetric) return -EINVAL; subtype = asymmetric_key_subtype(key); if (!subtype || !key->payload.data[0]) return -EINVAL; if (!subtype->eds_op) return -ENOTSUPP; ret = subtype->eds_op(params, in, out); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } static int asymmetric_key_verify_signature(struct kernel_pkey_params *params, const void *in, const void *in2) { struct public_key_signature sig = { .s_size = params->in2_len, .digest_size = params->in_len, .encoding = params->encoding, .hash_algo = params->hash_algo, .digest = (void *)in, .s = (void *)in2, }; return verify_signature(params->key, &sig); } struct key_type key_type_asymmetric = { .name = "asymmetric", .preparse = asymmetric_key_preparse, .free_preparse = asymmetric_key_free_preparse, .instantiate = generic_key_instantiate, .match_preparse = asymmetric_key_match_preparse, .match_free = asymmetric_key_match_free, .destroy = asymmetric_key_destroy, .describe = asymmetric_key_describe, .lookup_restriction = asymmetric_lookup_restriction, .asym_query = query_asymmetric_key, .asym_eds_op = asymmetric_key_eds_op, .asym_verify_signature = asymmetric_key_verify_signature, }; EXPORT_SYMBOL_GPL(key_type_asymmetric); /** * register_asymmetric_key_parser - Register a asymmetric key blob parser * @parser: The parser to register */ int register_asymmetric_key_parser(struct asymmetric_key_parser *parser) { struct asymmetric_key_parser *cursor; int ret; down_write(&asymmetric_key_parsers_sem); list_for_each_entry(cursor, &asymmetric_key_parsers, link) { if (strcmp(cursor->name, parser->name) == 0) { pr_err("Asymmetric key parser '%s' already registered\n", parser->name); ret = -EEXIST; goto out; } } list_add_tail(&parser->link, &asymmetric_key_parsers); pr_notice("Asymmetric key parser '%s' registered\n", parser->name); ret = 0; out: up_write(&asymmetric_key_parsers_sem); return ret; } EXPORT_SYMBOL_GPL(register_asymmetric_key_parser); /** * unregister_asymmetric_key_parser - Unregister a asymmetric key blob parser * @parser: The parser to unregister */ void unregister_asymmetric_key_parser(struct asymmetric_key_parser *parser) { down_write(&asymmetric_key_parsers_sem); list_del(&parser->link); up_write(&asymmetric_key_parsers_sem); pr_notice("Asymmetric key parser '%s' unregistered\n", parser->name); } EXPORT_SYMBOL_GPL(unregister_asymmetric_key_parser); /* * Module stuff */ static int __init asymmetric_key_init(void) { return register_key_type(&key_type_asymmetric); } static void __exit asymmetric_key_cleanup(void) { unregister_key_type(&key_type_asymmetric); } module_init(asymmetric_key_init); module_exit(asymmetric_key_cleanup);
31 22 101 9 2 1 11 24 24 48 53 8 39 70 141 9 37 19 4559 82 449 448 450 450 3983 3985 3984 3978 3985 72 249 250 7 77 70 70 143 183 105 119 1279 73 73 2 27 73 29 29 73 71 1 1296 1289 73 70 71 70 71 43 1226 1295 1296 150 149 46 46 46 46 46 46 46 46 2 46 45 1 46 33 36 6 42 42 42 81 82 81 77 53 75 76 76 75 75 75 75 75 12 76 82 82 42 76 76 76 76 76 73 3 73 3 138 146 53 2 4 199 47 116 76 100 76 101 101 21 101 118 217 89 141 218 219 89 143 218 52 170 219 38 6 37 5 38 118 45 73 25 48 101 86 13 53 53 53 7 3 1 1 1 1 1 1 113 70 70 67 69 70 69 69 2 2 121 1 117 120 121 170 170 112 112 1 111 114 114 113 114 114 28 11 85 12 85 79 27 75 8 71 63 71 65 70 71 71 70 71 71 39 83 18 71 40 46 85 71 71 18 71 41 67 85 85 70 71 22 16 21 85 85 74 85 74 79 29 55 78 78 78 79 78 79 85 85 78 13 13 13 74 29 73 74 73 9 33 40 9 33 41 40 77 27 51 11 40 10 36 45 6 45 45 45 45 45 3 3 2 2 2 1 1 45 40 44 45 39 45 77 78 79 79 79 22 22 40 45 116 118 118 118 118 13 13 101 70 117 118 118 116 118 45 101 100 118 118 118 45 45 45 45 45 45 45 33 45 37 37 45 13 45 45 44 13 45 33 5 25 5 49 48 1 46 46 1 33 35 46 6 32 45 22 33 33 29 42 42 13 42 1 1 2 1 3 3 3 3 3 1 2 2 6 1 5 3 3 1 2 4 2 2 1 3 3 3 6 3 2 1 4 1 1 2 2 2 2 2 2 2 2 2 6 6 6 6 3 3 6 6 1 1 11 2 1 8 2 2 1 2 2 4 2 1 1 2 67 67 66 30 24 6 30 284 8 277 263 18 3 8 2 4 1 6 6 79 75 79 9 70 128 128 111 12 3 1 101 101 29 115 116 116 116 116 116 116 101 38 65 61 31 8 2 2 107 76 75 75 193 75 75 74 3 74 82 72 73 69 3 7 2 96 7 94 2 78 73 6 2 7 96 96 80 2 80 2 79 80 1 1 95 94 1 80 77 79 76 80 81 93 5 7 73 84 8 87 96 95 81 93 1 93 79 92 17 77 89 4 89 2 6 3 3 8 8 9 9 8 2 3 7 7 7 8 9 22 9 22 17 4 19 1 19 3 17 16 2 4 4 101 101 101 101 101 101 35 96 101 100 101 101 101 14 73 72 1 73 1 52 21 73 72 1 73 25 48 65 5 87 1 69 84 14 73 38 38 38 37 6 1 5 5 5 5 5 5 5 5 5 6 1 4 28 28 28 28 28 28 28 28 28 28 28 27 28 1792 9 9 10 6 1 1229 11 1241 1241 1240 1230 1 10 2 3 9 8 2 9 9 11 1231 1233 1227 15 1230 3 1239 1241 11 1233 15 15 15 1226 3 1223 1226 1227 1227 1130 97 97 1226 139 139 139 17 9 1 9 9 4 4 4 2 5 5 7 7 3 1 2 3867 3870 2422 2419 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 /* * Generic process-grouping system. * * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * * Notifications support * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * --------------------------------------------------- * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "cgroup-internal.h" #include <linux/bpf-cgroup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/magic.h> #include <linux/mutex.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/hashtable.h> #include <linux/idr.h> #include <linux/kthread.h> #include <linux/atomic.h> #include <linux/cpuset.h> #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/sched/deadline.h> #include <linux/psi.h> #include <net/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/cgroup.h> #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * To avoid confusing the compiler (and generating warnings) with code * that attempts to access what would be a 0-element array (i.e. sized * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this * constant expression can be added. */ #define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. * * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif struct blocking_notifier_head cgroup_lifetime_notifier = BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier); DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); /* * Protects cgroup_file->kn for !self csses. It synchronizes notifications * against file removal/re-creation across css hiding. */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. */ static struct workqueue_struct *cgroup_destroy_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of cgroup subsystem names */ #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ #define SUBSYS(_x) \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, static struct static_key_true *cgroup_subsys_enabled_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, static struct static_key_true *cgroup_subsys_on_dfl_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu); static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu); /* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.self.rstat_cpu = &root_rstat_cpu, .cgrp.rstat_base_cpu = &root_rstat_base_cpu, }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ static u16 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); /* * Assign a monotonically increasing serial number to csses. It guarantees * cgroups with bigger numbers are newer than those with smaller numbers. * Also, as csses are always appended to the parent's ->children list, it * guarantees that sibling csses are always sorted in the ascending serial * number order on the list. Protected by cgroup_mutex. */ static u64 css_serial_nr_next = 1; /* * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, .root_cset = &init_css_set, }; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; static struct cftype cgroup_psi_files[]; /* cgroup optional features */ enum cgroup_opt_features { #ifdef CONFIG_PSI OPT_FEATURE_PRESSURE, #endif OPT_FEATURE_COUNT }; static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { #ifdef CONFIG_PSI "pressure", #endif }; static u16 cgroup_feature_disable_mask __read_mostly; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline #define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); #include <linux/cgroup_refcnt.h> #endif /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest * * cgroup_subsys_enabled() can only be used with literal subsys names which * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ bool cgroup_ssid_enabled(int ssid) { if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" * and "name" are disallowed. * * - When mounting an existing superblock, mount options should match. * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. * * - "cgroup.clone_children" is removed. * * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup * and its descendants contain no task; otherwise, 1. The file also * generates kernfs notification which can be monitored through poll and * [di]notify when the value of the file changes. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens and * take masks of ancestors with non-empty cpus/mems, instead of being * moved to an ancestor. * * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) { int ret; idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); spin_unlock_bh(&cgroup_idr_lock); } static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; } static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } /* can @cgrp host both domain and threaded children? */ static bool cgroup_is_mixable(struct cgroup *cgrp) { /* * Root isn't under domain level resource control exempting it from * the no-internal-process constraint, so it can serve as a thread * root and a parent of resource domains at the same time. */ return !cgroup_parent(cgrp); } /* can @cgrp become a thread root? Should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return true; /* domain roots can't be nested under threaded */ if (cgroup_is_threaded(cgrp)) return false; /* can only have either domain or threaded children */ if (cgrp->nr_populated_domain_children) return false; /* and no domain controllers can be enabled */ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return false; return true; } /* is @cgrp root of a threaded subtree? */ static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) return false; /* a domain w/ threaded children is a thread root */ if (cgrp->nr_threaded_children) return true; /* * A domain which has tasks and explicit threaded controllers * enabled is a thread root. */ if (cgroup_has_tasks(cgrp) && (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) return true; return false; } /* a domain which isn't connected to the root w/o brekage can't be used */ static bool cgroup_is_valid_domain(struct cgroup *cgrp) { /* the cgroup itself can be a thread root */ if (cgroup_is_threaded(cgrp)) return false; /* but the ancestors can't be unless mixable */ while ((cgrp = cgroup_parent(cgrp))) { if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) return false; if (cgroup_is_threaded(cgrp)) return false; } return true; } /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; if (parent) { u16 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | cgrp_dfl_implicit_ss_mask); return root_ss_mask; } /* subsystems enabled on a cgroup */ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { u16 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } return cgrp->root->subsys_mask; } /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and * the caller is responsible for pinning the returned css if it wants to * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->self; } /** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); if (!ss) return &cgrp->self; /* * This function is used while updating css associations and thus * can't test the csses directly. Test ss_mask. */ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); if (!cgrp) return NULL; } return cgroup_css(cgrp, ss); } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * * The returned css is not guaranteed to be online, and therefore it is the * callers responsibility to try get a reference for it. */ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; do { css = cgroup_css(cgrp, ss); if (css) return css; cgrp = cgroup_parent(cgrp); } while (cgrp); return init_css_set.subsys[ss->id]; } /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * The returned css must be put using css_put(). */ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; rcu_read_lock(); do { css = cgroup_css(cgrp, ss); if (css && css_tryget_online(css)) goto out_unlock; cgrp = cgroup_parent(cgrp); } while (cgrp); css = init_css_set.subsys[ss->id]; css_get(css); out_unlock: rcu_read_unlock(); return css; } EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); cgroup_get(cgrp); } /** * __cgroup_task_count - count the number of tasks in a cgroup. The caller * is responsible for taking the css_set_lock. * @cgrp: the cgroup in question */ int __cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += link->cset->nr_tasks; return count; } /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question */ int cgroup_task_count(const struct cgroup *cgrp) { int count; spin_lock_irq(&css_set_lock); count = __cgroup_task_count(cgrp); spin_unlock_irq(&css_set_lock); return count; } static struct cgroup *kn_priv(struct kernfs_node *kn) { struct kernfs_node *parent; /* * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT. * Therefore it is always safe to dereference this pointer outside of a * RCU section. */ parent = rcu_dereference_check(kn->__parent, kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT); return parent->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). * seq_css() is only called from a kernfs file operation which has * an active reference on the file. Because all the subsystem * files are drained before a css is disassociated with a cgroup, * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * * Should be called under cgroup_mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of * @ss_mask is set. */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ (ss) = cgroup_subsys[ssid]; \ { #define while_each_subsys_mask() \ } \ } \ } while (false) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else /* walk live descendants in pre order */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* walk live descendants in postorder */ #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* * The following field is re-initialized when this cset gets linked * in cgroup_init(). However, let's initialize the field * statically too so that the default cgroup can be accessed safely * early during boot. */ .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ static bool css_set_threaded(struct css_set *cset) { return cset->dom_cset != cset; } /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set * * css_set_populated() should be the same as !!cset->nr_tasks at steady * state. However, css_set_populated() can be called while a task is being * added to or removed from the linked list before the nr_tasks is * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { lockdep_assert_held(&css_set_lock); return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); } /** * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first * task or losing the last. Update @cgrp->nr_populated_* accordingly. The * count is propagated towards root so that a given cgroup's * nr_populated_children is zero iff none of its descendants contain any * tasks. * * @cgrp's interface file "cgroup.populated" is zero if both * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and * 1 otherwise. When the sum changes from or to zero, userland is notified * that the content of the interface file has changed. This can be used to * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { struct cgroup *child = NULL; int adj = populated ? 1 : -1; lockdep_assert_held(&css_set_lock); do { bool was_populated = cgroup_is_populated(cgrp); if (!child) { cgrp->nr_populated_csets += adj; } else { if (cgroup_is_threaded(child)) cgrp->nr_populated_threaded_children += adj; else cgrp->nr_populated_domain_children += adj; } if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } /** * css_set_update_populated - update populated state of a css_set * @cset: target css_set * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) cgroup_update_populated(link->cgrp, populated); } /* * @task is leaving, advance task iterators which are pointing to it so * that they can resume at the next position. Advancing an iterator might * remove it from the list, use safe walk. See css_task_iter_skip() for * details. */ static void css_set_skip_task_iters(struct css_set *cset, struct task_struct *task) { struct css_task_iter *it, *pos; list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) css_task_iter_skip(it, task); } /** * css_set_move_task - move a task from one css_set to another * @task: task being moved * @from_cset: css_set @task currently belongs to (may be NULL) * @to_cset: new css_set @task is being moved to (may be NULL) * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks * * Move @task from @from_cset to @to_cset. If @task didn't belong to any * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ static void css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks) { lockdep_assert_held(&css_set_lock); if (to_cset && !css_set_populated(to_cset)) css_set_update_populated(to_cset, true); if (from_cset) { WARN_ON_ONCE(list_empty(&task->cg_list)); css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); } else { WARN_ON_ONCE(!list_empty(&task->cg_list)); } if (to_cset) { /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race * against cgroup_exit()/cgroup_free() dropping the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } } /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) key += (unsigned long)css[i]; key = (key >> 16) ^ key; return key; } void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&css_set_lock); if (!refcount_dec_and_test(&cset->refcount)) return; WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); /* This css_set is dead. Unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); } hash_del(&cset->hlist); css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); if (cgroup_parent(link->cgrp)) cgroup_put(link->cgrp); kfree(link); } if (css_set_threaded(cset)) { list_del(&cset->threaded_csets_node); put_css_set_locked(cset->dom_cset); } kfree_rcu(cset, rcu_head); } /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested * @old_cset: existing css_set for a task * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, struct css_set *old_cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* * On the default hierarchy, there can be csets which are * associated with the same set of cgroups but different csses. * Let's first ensure that csses match. */ if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; /* @cset's domain should match the default cgroup's */ if (cgroup_on_dfl(new_cgrp)) new_dfl_cgrp = new_cgrp; else new_dfl_cgrp = old_cset->dfl_cgrp; if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) return false; /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may * share the same effective css, this comparison is always * necessary. */ l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { struct cgrp_cset_link *link1, *link2; struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ if (l1 == &cset->cgrp_links) { BUG_ON(l2 != &old_cset->cgrp_links); break; } else { BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); cgrp1 = link1->cgrp; cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ BUG_ON(cgrp1->root != cgrp2->root); /* * If this hierarchy is the hierarchy of the cgroup * that's changing, then we need to check that this * css_set points to the new cgroup; if it's any other * hierarchy, then this css_set should point to the * same cgroup as the old css_set. */ if (cgrp1->root == new_cgrp->root) { if (cgrp1 != new_cgrp) return false; } else { if (cgrp1 != cgrp2) return false; } } return true; } /** * find_existing_css_set - init css array and find the matching css_set * @old_cset: the css_set that we're using before the cgroup transition * @cgrp: the cgroup that we're moving into * @template: out param for the new set of csses, should be clear on entry */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; int i; /* * Build the set of subsystem state objects that we want to see in the * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want * to change the css. */ template[i] = old_cset->subsys[i]; } } key = css_set_hash(template); hash_for_each_possible(css_set_table, cset, hlist, key) { if (!compare_css_sets(cset, old_cset, cgrp, template)) continue; /* This css_set matches what we need */ return cset; } /* No existing cgroup group matched */ return NULL; } static void free_cgrp_cset_links(struct list_head *links_to_free) { struct cgrp_cset_link *link, *tmp_link; list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { list_del(&link->cset_link); kfree(link); } } /** * allocate_cgrp_cset_links - allocate cgrp_cset_links * @count: the number of links to allocate * @tmp_links: list_head the allocated links are put on * * Allocate @count cgrp_cset_link structures and chain them on @tmp_links * through ->cset_link. Returns 0 on success or -errno. */ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { struct cgrp_cset_link *link; int i; INIT_LIST_HEAD(tmp_links); for (i = 0; i < count; i++) { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { free_cgrp_cset_links(tmp_links); return -ENOMEM; } list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup */ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp) { struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); if (cgroup_on_dfl(cgrp)) cset->dfl_cgrp = cgrp; link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; /* * Always add links to the tail of the lists so that the lists are * in chronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) cgroup_get_live(cgrp); } /** * find_css_set - return a new css_set with one cgroup updated * @old_cset: the baseline css_set * @cgrp: the cgroup to be updated * * Return a new css_set that's equivalent to @old_cset, but with @cgrp * substituted into the appropriate hierarchy. */ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp) { struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; struct cgroup_subsys *ss; unsigned long key; int ssid; lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches * the desired set */ spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); spin_unlock_irq(&css_set_lock); if (cset) return cset; cset = kzalloc(sizeof(*cset), GFP_KERNEL); if (!cset) return NULL; /* Allocate all the cgrp_cset_link objects that we'll need */ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { kfree(cset); return NULL; } refcount_set(&cset->refcount, 1); cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_src_preload_node); INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == cgrp->root) c = cgrp; link_css_set(&tmp_links, cset, c); } BUG_ON(!list_empty(&tmp_links)); css_set_count++; /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cset->subsys[ssid]; list_add_tail(&cset->e_cset_node[ssid], &css->cgroup->e_csets[ssid]); css_get(css); } spin_unlock_irq(&css_set_lock); /* * If @cset should be threaded, look up the matching dom_cset and * link them up. We first fully initialize @cset then look for the * dom_cset. It's simpler this way and safe as @cset is guaranteed * to stay empty until we return. */ if (cgroup_is_threaded(cset->dfl_cgrp)) { struct css_set *dcset; dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); if (!dcset) { put_css_set(cset); return NULL; } spin_lock_irq(&css_set_lock); cset->dom_cset = dcset; list_add_tail(&cset->threaded_csets_node, &dcset->threaded_csets); spin_unlock_irq(&css_set_lock); } return cset; } struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ if (favor && !favoring) { rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } } static int cgroup_init_root_id(struct cgroup_root *root) { int id; lockdep_assert_held(&cgroup_mutex); id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); if (id < 0) return id; root->hierarchy_id = id; return 0; } static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } void cgroup_free_root(struct cgroup_root *root) { kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; int ret; trace_cgroup_destroy_root(root); cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, CGROUP_LIFETIME_OFFLINE, cgrp); WARN_ON_ONCE(notifier_to_errno(ret)); /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's * root cgroup */ spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } spin_unlock_irq(&css_set_lock); WARN_ON_ONCE(list_empty(&root->root_list)); list_del_rcu(&root->root_list); cgroup_root_count--; if (!have_favordynmods) cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); cgroup_unlock(); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } /* * Returned cgroup is without refcount but it's valid as long as cset pins it. */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { struct cgroup *res_cgroup = NULL; if (cset == &init_css_set) { res_cgroup = &root->cgrp; } else if (root == &cgrp_dfl_root) { res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == root) { res_cgroup = c; break; } } } /* * If cgroup_mutex is not held, the cgrp_cset_link will be freed * before we remove the cgroup root from the root_list. Consequently, * when accessing a cgroup root, the cset_link may have already been * freed, resulting in a NULL res_cgroup. However, by holding the * cgroup_mutex, we ensure that res_cgroup can't be NULL. * If we don't hold cgroup_mutex in the caller, we must do the NULL * check. */ return res_cgroup; } /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy */ static struct cgroup * current_cgns_cgroup_from_root(struct cgroup_root *root) { struct cgroup *res = NULL; struct css_set *cset; lockdep_assert_held(&css_set_lock); rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; res = __cset_cgroup_from_root(cset, root); rcu_read_unlock(); /* * The namespace_sem is held by current, so the root cgroup can't * be umounted. Therefore, we can ensure that the res is non-NULL. */ WARN_ON_ONCE(!res); return res; } /* * Look up cgroup associated with current task's cgroup namespace on the default * hierarchy. * * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu * pointers. * - css_set_lock is not needed because we just read cset->dfl_cgrp. * - As a bonus returned cgrp is pinned with the current because it cannot * switch cgroup_ns asynchronously. */ static struct cgroup *current_cgns_cgroup_dfl(void) { struct css_set *cset; if (current->nsproxy) { cset = current->nsproxy->cgroup_ns->root_cset; return __cset_cgroup_from_root(cset, &cgrp_dfl_root); } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() * on a task which has already passed exit_task_namespaces() and * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all * cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } } /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); } /* * Return the cgroup for "task" from the given hierarchy. Must be * called with css_set_lock held to prevent task's groups from being modified. * Must be called with either cgroup_mutex or rcu read lock to prevent the * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* * No need to lock the task - since we hold css_set_lock the * task can't change groups. */ return cset_cgroup_from_root(task_css_set(task), root); } /* * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. * So in general, code holding cgroup_mutex can't rely on the count * field not changing. However, if the count goes to zero, then only * cgroup_attach_task() can increment it again. Because a count of zero * means that no tasks are currently attached, therefore there is no * way a task attached to that cgroup can fork (the other way to * increment the count). So code holding cgroup_mutex can safely * assume that if the count is zero, it will stay zero. Similarly, if * a task holds cgroup_mutex on a cgroup with zero count, it * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { struct cgroup_subsys *ss = cft->ss; if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : ""; snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); } else { strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); } return buf; } /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write) { if (cft->flags & CFTYPE_WORLD_WRITABLE) mode |= S_IWUGO; else mode |= S_IWUSR; } return mode; } /** * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. */ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { u16 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; cur_ss_mask = new_ss_mask; } return cur_ss_mask; } /** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * * This helper undoes cgroup_kn_lock_live() and should be invoked before * the method finishes if locking succeeded. Note that once this function * returns the cgroup returned by cgroup_kn_lock_live() may become * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn_priv(kn); cgroup_unlock(); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); } /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn_priv(kn); /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ if (!cgroup_tryget(cgrp)) return NULL; kernfs_break_active_protection(kn); if (drain_offline) cgroup_lock_and_drain_offline(cgrp); else cgroup_lock(); if (!cgroup_is_dead(cgrp)) return cgrp; cgroup_kn_unlock(kn); return NULL; } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); if (cft->file_offset) { struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); timer_delete_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** * css_clear_dir - remove subsys files in a cgroup directory * @css: target css */ static void css_clear_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts; if (!(css->flags & CSS_VISIBLE)) return; css->flags &= ~CSS_VISIBLE; if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); if (cgroup_psi_enabled()) cgroup_addrm_files(css, cgrp, cgroup_psi_files, false); } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, false); } } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } } /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css * * On failure, no file is added. */ static int css_populate_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; if (css->flags & CSS_VISIBLE) return 0; if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); if (ret < 0) return ret; if (cgroup_psi_enabled()) { ret = cgroup_addrm_files(css, cgrp, cgroup_psi_files, true); if (ret < 0) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); return ret; } } } else { ret = cgroup_addrm_files(css, cgrp, cgroup1_base_files, true); if (ret < 0) return ret; } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); if (ret < 0) { failed_cfts = cfts; goto err; } } } css->flags |= CSS_VISIBLE; return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { if (cfts == failed_cfts) break; cgroup_addrm_files(css, cgrp, cfts, false); } return ret; } int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) { /* * If @ss has non-root csses attached to it, can't move. * If @ss is an implicit controller, it is exempt from this * rule and can be stolen. */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; /* * Collect ssid's that need to be disabled from default * hierarchy. */ if (ss->root == &cgrp_dfl_root) dfl_disable_ss_mask |= 1 << ssid; } while_each_subsys_mask(); if (dfl_disable_ss_mask) { struct cgroup *scgrp = &cgrp_dfl_root.cgrp; /* * Controllers from default hierarchy that need to be rebound * are all disabled together in one go. */ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset, *cset_pos; struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); if (src_root != &cgrp_dfl_root) { /* disable from the source */ src_root->subsys_mask &= ~(1 << ssid); WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; spin_lock_irq(&css_set_lock); css->cgroup = dcgrp; WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); /* * all css_sets of scgrp together in same order to dcgrp, * patch in-flight iterators to preserve correct iteration. * since the iterator is always advanced right away and * finished when it->cset_pos meets it->cset_head, so only * update it->cset_head is enough here. */ list_for_each_entry(it, &cset->task_iters, iters_node) if (it->cset_head == &scgrp->e_csets[ss->id]) it->cset_head = &dcgrp->e_csets[ss->id]; } spin_unlock_irq(&css_set_lock); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } ret = cgroup_apply_control(dcgrp); if (ret) pr_warn("partial failure to rebind %s controller (err=%d)\n", ss->name, ret); if (ss->bind) ss->bind(css); } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; } int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); struct cgroup *ns_cgroup; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return -ENOMEM; spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); if (len == -E2BIG) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); len = 0; } kfree(buf); return len; } enum cgroup2_param { Opt_nsdelegate, Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, Opt_memory_hugetlb_accounting, Opt_pids_localevents, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), fsparam_flag("pids_localevents", Opt_pids_localevents), {} }; static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; opt = fs_parse(fc, cgroup2_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; case Opt_memory_hugetlb_accounting: ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; return 0; case Opt_pids_localevents: ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; return 0; } return -EINVAL; } struct cgroup_of_peak *of_peak(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; return &ctx->peak; } static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { if (root_flags & CGRP_ROOT_NS_DELEGATE) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; cgroup_favor_dynmods(&cgrp_dfl_root, root_flags & CGRP_ROOT_FAVOR_DYNMODS); if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS; } } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) seq_puts(seq, ",memory_hugetlb_accounting"); if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) seq_puts(seq, ",pids_localevents"); return 0; } static int cgroup_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); apply_cgroup_root_flags(ctx->flags); return 0; } static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; int ssid; INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); #ifdef CONFIG_CGROUP_BPF for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++) cgrp->bpf.revisions[i] = 1; #endif init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_fs_context *ctx) { struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); /* DYNMODS must be modified through cgroup_favor_dynmods() */ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name) strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; lockdep_assert_held(&cgroup_mutex); ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out; /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. Later rebinding may disable * controllers on the default hierarchy and thus create new csets, * which can't be more than the existing ones. Allocate 2x. */ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) goto cancel_ref; kf_sops = root == &cgrp_dfl_root ? &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | KERNFS_ROOT_SUPPORT_USER_XATTR | KERNFS_ROOT_INVARIANT_PARENT, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); goto exit_root_id; } root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root; ret = rebind_subsystems(root, ss_mask); if (ret) goto exit_stats; ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, CGROUP_LIFETIME_ONLINE, root_cgrp); WARN_ON_ONCE(notifier_to_errno(ret)); trace_cgroup_setup_root(root); /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++; /* * Link the root cgroup in this hierarchy into all the css_set * objects. */ spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); ret = 0; goto out; exit_stats: css_rstat_exit(&root_cgrp->self); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); cancel_ref: percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; } int cgroup_do_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; ctx->kfc.root = ctx->root->kf_root; if (fc->fs_type == &cgroup2_fs_type) ctx->kfc.magic = CGROUP2_SUPER_MAGIC; else ctx->kfc.magic = CGROUP_SUPER_MAGIC; ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. */ if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; cgroup_lock(); spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); cgroup_unlock(); nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); if (IS_ERR(nsdentry)) { deactivate_locked_super(sb); ret = PTR_ERR(nsdentry); nsdentry = NULL; } fc->root = nsdentry; } if (!ctx->kfc.new_sb_created) cgroup_put(&ctx->root->cgrp); return ret; } /* * Destroy a cgroup filesystem context. */ static void cgroup_fs_context_free(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); kfree(ctx->name); kfree(ctx->release_agent); put_cgroup_ns(ctx->ns); kernfs_free_fs_context(fc); kfree(ctx); } static int cgroup_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; WRITE_ONCE(cgrp_dfl_visible, true); cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; ret = cgroup_do_get_tree(fc); if (!ret) apply_cgroup_root_flags(ctx->flags); return ret; } static const struct fs_context_operations cgroup_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup2_parse_param, .get_tree = cgroup_get_tree, .reconfigure = cgroup_reconfigure, }; static const struct fs_context_operations cgroup1_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup1_parse_param, .get_tree = cgroup1_get_tree, .reconfigure = cgroup1_reconfigure, }; /* * Initialise the cgroup filesystem creation/reconfiguration context. Notably, * we select the namespace we're going to use. */ static int cgroup_init_fs_context(struct fs_context *fc) { struct cgroup_fs_context *ctx; ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; if (fc->fs_type == &cgroup2_fs_type) fc->ops = &cgroup_fs_context_ops; else fc->ops = &cgroup1_fs_context_ops; put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; if (have_favordynmods) ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * * And don't kill the default root. */ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && !percpu_ref_is_dying(&root->cgrp.self.refcnt)) percpu_ref_kill(&root->cgrp.self.refcnt); cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } struct file_system_type cgroup_fs_type = { .name = "cgroup", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup1_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup2_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; #ifdef CONFIG_CPUSETS_V1 enum cpuset_param { Opt_cpuset_v2_mode, }; static const struct fs_parameter_spec cpuset_fs_parameters[] = { fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), {} }; static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; opt = fs_parse(fc, cpuset_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_cpuset_v2_mode: ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; return 0; } return -EINVAL; } static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, .parse_param = cpuset_parse_param, }; /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ static int cpuset_init_fs_context(struct fs_context *fc) { char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); struct cgroup_fs_context *ctx; int err; err = cgroup_init_fs_context(fc); if (err) { kfree(agent); return err; } fc->ops = &cpuset_fs_context_ops; ctx = cgroup_fc2context(fc); ctx->subsys_mask = 1 << cpuset_cgrp_id; ctx->flags |= CGRP_ROOT_NOPREFIX; ctx->release_agent = agent; get_filesystem(&cgroup_fs_type); put_filesystem(fc->fs_type); fc->fs_type = &cgroup_fs_type; return 0; } static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, .parameters = cpuset_fs_parameters, .fs_flags = FS_USERNS_MOUNT, }; #endif int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { int ret; cgroup_lock(); spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); spin_unlock_irq(&css_set_lock); cgroup_unlock(); return ret; } EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() * implementations (e.g. cpuset), also need to disable CPU hotplug. * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can * lead to deadlocks. * * Bringing up a CPU may involve creating and destroying tasks which requires * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while * write-locking threadgroup_rwsem, the locking order is reversed and we end up * waiting for an on-going CPU hotplug operation which in turn is waiting for * the threadgroup_rwsem to be released to create new tasks. For more details: * * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu * * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. */ void cgroup_attach_lock(bool lock_threadgroup) { cpus_read_lock(); if (lock_threadgroup) percpu_down_write(&cgroup_threadgroup_rwsem); } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem */ void cgroup_attach_unlock(bool lock_threadgroup) { if (lock_threadgroup) percpu_up_write(&cgroup_threadgroup_rwsem); cpus_read_unlock(); } /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context * * Add @task, which is a migration target, to @mgctx->tset. This function * becomes noop if @task doesn't need to be migrated. @task's css_set * should have been added as a migration source and @task->cg_list will be * moved from the css_set's tasks list to mg_tasks one. */ static void cgroup_migrate_add_task(struct task_struct *task, struct cgroup_mgctx *mgctx) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) return; /* cgroup_threadgroup_rwsem protects racing against forks */ WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) return; mgctx->tset.nr_tasks++; list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, &mgctx->tset.dst_csets); } /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); else task = list_next_entry(task, cg_list); if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; /* * This function may be called both before and * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ if (cset->mg_dst_cset) *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; else *dst_cssp = cset->subsys[tset->ssid]; return task; } cset = list_next_entry(cset, mg_node); task = NULL; } return NULL; } /** * cgroup_migrate_execute - migrate a taskset * @mgctx: migration context * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. */ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; /* check that we can legitimately attach to the cgroup */ if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); if (ret) { failed_ssid = ssid; goto out_cancel_attach; } } } while_each_subsys_mask(); } /* * Now that we're guaranteed success, proceed to move all tasks to * the new cgroup. There are no failure cases after here, so this * is the commit point. */ spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); from_cset->nr_tasks--; /* * If the source or destination cgroup is frozen, * the task might require to change its state. */ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, to_cset->dfl_cgrp); put_css_set_locked(from_cset); } } spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. * Nothing is sensitive to fork() after this point. Notify * controllers that migration is complete. */ tset->csets = &tset->dst_csets; if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); } } while_each_subsys_mask(); } ret = 0; goto out_release_tset; out_cancel_attach: if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { tset->ssid = ssid; ss->cancel_attach(tset); } } while_each_subsys_mask(); } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); /* * Re-initialize the cgroup_taskset structure in case it is reused * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() * iteration. */ tset->nr_tasks = 0; tset->csets = &tset->src_csets; return ret; } /** * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * * On the default hierarchy, except for the mixable, (possible) thread root * and threaded cgroups, subtree_control must be zero for migration * destination cgroups with tasks so that child cgroups don't compete * against tasks. */ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { /* v1 doesn't have any restriction */ if (!cgroup_on_dfl(dst_cgrp)) return 0; /* verify @dst_cgrp can host resources */ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. */ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) return 0; /* apply no-internal-process constraint */ if (dst_cgrp->subtree_control) return -EBUSY; return 0; } /** * cgroup_migrate_finish - cleanup after attach * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_src_preload_node); put_css_set_locked(cset); } list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } spin_unlock_irq(&css_set_lock); } /** * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem * even if the target is a process. Threads may be created and destroyed * but as long as cgroup_mutex is not dropped, no new css_set can be put * into play and the preloaded css_sets are guaranteed to cover all * migrations. */ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); /* * If ->dead, @src_set is associated with one or more dead cgroups * and doesn't contain any migratable tasks. Ignore it early so * that the rest of migration path doesn't get confused by it. */ if (src_cset->dead) return; if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been * preloaded to @mgctx->preloaded_src_csets. This function looks up and * pins all destination css_sets, links each to its source, and append them * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @mgctx. */ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); /* * If src cset equals dst, it's noop. Drop the src. * cgroup_migrate() will skip the cset too. Note that we * can't handle src == dst as some nodes are used by both. */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; } src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_dst_preload_node)) list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); for_each_subsys(ss, ssid) if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) mgctx->ss_mask |= 1 << ssid; } return 0; } /** * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * * As long as a controller's ->can_attach() doesn't fail, this function is * guaranteed to succeed. This means that, excluding ->can_attach() * failure, when migrating multiple targets, the success or failure can be * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx) { struct task_struct *task; /* * The following thread iteration should be inside an RCU critical * section to prevent tasks from being freed while taking the snapshot. * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); } /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @dst_cgrp: the cgroup to attach to * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); if (!ret) TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); /* * If we migrate a single thread, we don't care about threadgroup * stability. If the thread is `current`, it won't exit(2) under our * hands or change PID through exec(2). We exclude * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write * callers by cgroup_mutex. * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); *threadgroup_locked = pid || threadgroup; cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); goto out_unlock_threadgroup; } } else { tsk = current; } if (threadgroup) tsk = tsk->group_leader; /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); goto out_unlock_threadgroup; } get_task_struct(tsk); goto out_unlock_rcu; out_unlock_threadgroup: cgroup_attach_unlock(*threadgroup_locked); *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; /* release reference from cgroup_procs_write_start() */ put_task_struct(task); cgroup_attach_unlock(threadgroup_locked); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_puts(seq, ss->name); printed = true; } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } /* show controllers which are enabled for a given cgroup's children */ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } /** * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * * @cgrp's control masks have changed and its subtree's css associations * need to be updated accordingly. This function looks up all css_sets * which are attached to the subtree, creates the matching updated css_sets * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; /* * As cgroup_update_dfl_csses() is only called by * cgroup_apply_control(). The csses associated with the * given cgrp will not be affected by changes made to * its subtree_control file. We can skip them. */ if (dsct == cgrp) continue; list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* * We need to write-lock threadgroup_rwsem while migrating tasks. * However, if there are no source csets for @cgrp, changing its * controllers isn't gonna produce any task migrations and the * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); cgroup_attach_lock(has_tasks); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(has_tasks); return ret; } /** * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; restart: cgroup_lock(); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); cgroup_unlock(); schedule(); finish_wait(&dsct->offline_waitq, &wait); cgroup_put(dsct); goto restart; } } } /** * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; dsct->old_dom_cgrp = dsct->dom_cgrp; } } /** * cgroup_propagate_control - refresh control masks of a subtree * @cgrp: root of the target subtree * * For @cgrp and its subtree, ensure ->subtree_ss_mask matches * ->subtree_control and propagate controller availability through the * subtree so that descendants don't have unavailable controllers enabled. */ static void cgroup_propagate_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->subtree_control &= cgroup_control(dsct); dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct->subtree_control, cgroup_ss_mask(dsct)); } } /** * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; dsct->dom_cgrp = dsct->old_dom_cgrp; } } static bool css_visible(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; if (cgroup_control(cgrp) & (1 << ss->id)) return true; if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) return false; return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; } /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and create new csses or make the existing ones * visible. A css is created invisible if it's being implicitly enabled * through dependency. An invisible css is made visible when the userland * explicitly enables it. * * Returns 0 on success, -errno on failure. On failure, csses which have * been processed already aren't cleaned up. The caller is responsible for * cleaning up with cgroup_apply_control_disable(). */ static int cgroup_apply_control_enable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; if (!css) { css = css_create(dsct, ss); if (IS_ERR(css)) return PTR_ERR(css); } WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css_visible(css)) { ret = css_populate_dir(css); if (ret) return ret; } } } return 0; } /** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other * subsystems are still depending on it. The css must not actively control * resources and be in the vanilla state if it's made visible again later. * Controllers which may be depended upon should provide ->css_reset() for * this purpose. */ static void cgroup_apply_control_disable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!css) continue; WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); } } } } /** * cgroup_apply_control - apply control mask updates to the subtree * @cgrp: root of the target subtree * * subsystems can be enabled and disabled in a subtree using the following * steps. * * 1. Call cgroup_save_control() to stash the current state. * 2. Update ->subtree_control masks in the subtree as desired. * 3. Call cgroup_apply_control() to apply the changes. * 4. Optionally perform other related operations. * 5. Call cgroup_finalize_control() to finish up. * * This function implements step 3 and propagates the mask changes * throughout @cgrp's subtree, updates csses accordingly and perform * process migrations. */ static int cgroup_apply_control(struct cgroup *cgrp) { int ret; cgroup_propagate_control(cgrp); ret = cgroup_apply_control_enable(cgrp); if (ret) return ret; /* * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ return cgroup_update_dfl_csses(cgrp); } /** * cgroup_finalize_control - finalize control mask update * @cgrp: root of the target subtree * @ret: the result of the update * * Finalize control mask update. See cgroup_apply_control() for more info. */ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) { if (ret) { cgroup_restore_control(cgrp); cgroup_propagate_control(cgrp); } cgroup_apply_control_disable(cgrp); } static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) { u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) return 0; /* can @cgrp host any resources? */ if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) return -EOPNOTSUPP; /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return 0; if (domain_enable) { /* can't enable domain controllers inside a thread subtree */ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return -EOPNOTSUPP; } else { /* * Threaded controllers can handle internal competitions * and are always allowed inside a (prospective) thread * subtree. */ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return 0; } /* * Controllers can't be enabled for a cgroup with tasks to avoid * child cgroups competing against tasks. */ if (cgroup_has_tasks(cgrp)) return -EBUSY; return 0; } /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { enable |= 1 << ssid; disable &= ~(1 << ssid); } else if (*tok == '-') { disable |= 1 << ssid; enable &= ~(1 << ssid); } else { return -EINVAL; } break; } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } } } } if (!enable && !disable) { ret = 0; goto out_unlock; } ret = cgroup_vet_subtree_control_enable(cgrp, enable); if (ret) goto out_unlock; /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); cgrp->subtree_control |= enable; cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); cgroup_finalize_control(cgrp, ret); if (ret) goto out_unlock; kernfs_activate(cgrp->kn); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } /** * cgroup_enable_threaded - make @cgrp threaded * @cgrp: the target cgroup * * Called when "threaded" is written to the cgroup.type interface file and * tries to make @cgrp threaded and join the parent's resource domain. * This function is never called on the root cgroup as cgroup.type doesn't * exist on it. */ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; struct cgroup *dsct; struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); /* noop if already threaded */ if (cgroup_is_threaded(cgrp)) return 0; /* * If @cgroup is populated or has domain controllers enabled, it * can't be switched. While the below cgroup_can_be_thread_root() * test can catch the same conditions, that's only when @parent is * not mixable, so let's check it explicitly. */ if (cgroup_is_populated(cgrp) || cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return -EOPNOTSUPP; /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) return -EOPNOTSUPP; /* * The following shouldn't cause actual migrations and should * always succeed. */ cgroup_save_control(cgrp); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) if (dsct == cgrp || cgroup_is_threaded(dsct)) dsct->dom_cgrp = dom_cgrp; ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; cgroup_finalize_control(cgrp, ret); return ret; } static int cgroup_type_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; if (cgroup_is_threaded(cgrp)) seq_puts(seq, "threaded\n"); else if (!cgroup_is_valid_domain(cgrp)) seq_puts(seq, "domain invalid\n"); else if (cgroup_is_thread_root(cgrp)) seq_puts(seq, "domain threaded\n"); else seq_puts(seq, "domain\n"); return 0; } static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int ret; /* only switching to threaded mode is supported */ if (strcmp(strstrip(buf), "threaded")) return -EINVAL; /* drain dying csses before we re-apply (threaded) subtree control */ cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENOENT; /* threaded can only be enabled */ ret = cgroup_enable_threaded(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_max_descendants_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int descendants = READ_ONCE(cgrp->max_descendants); if (descendants == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", descendants); return 0; } static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int descendants; ssize_t ret; buf = strstrip(buf); if (!strcmp(buf, "max")) { descendants = INT_MAX; } else { ret = kstrtoint(buf, 0, &descendants); if (ret) return ret; } if (descendants < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_descendants = descendants; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_max_depth_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int depth = READ_ONCE(cgrp->max_depth); if (depth == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", depth); return 0; } static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int depth; buf = strstrip(buf); if (!strcmp(buf, "max")) { depth = INT_MAX; } else { ret = kstrtoint(buf, 0, &depth); if (ret) return ret; } if (depth < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_depth = depth; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_events_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); return 0; } static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; struct cgroup_subsys_state *css; int dying_cnt[CGROUP_SUBSYS_COUNT]; int ssid; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); /* * Show the number of live and dying csses associated with each of * non-inhibited cgroup subsystems that is bound to cgroup v2. * * Without proper lock protection, racing is possible. So the * numbers may not be consistent when that happens. */ rcu_read_lock(); for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { dying_cnt[ssid] = -1; if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) || (cgroup_subsys[ssid]->root != &cgrp_dfl_root)) continue; css = rcu_dereference_raw(cgroup->subsys[ssid]); dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid]; seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name, css ? (css->nr_descendants + 1) : 0); } seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { if (dying_cnt[ssid] >= 0) seq_printf(seq, "nr_dying_subsys_%s %d\n", cgroup_subsys[ssid]->name, dying_cnt[ssid]); } rcu_read_unlock(); return 0; } #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get @cgrp's css associated with @ss. If the css doesn't exist * or is offline, %NULL is returned. */ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; rcu_read_lock(); css = cgroup_css(cgrp, ss); if (css && !css_tryget_online(css)) css = NULL; rcu_read_unlock(); return css; } static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_extra_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_extra_stat_show(seq, css); css_put(css); return ret; } static int cgroup_local_stat_show(struct seq_file *seq, struct cgroup *cgrp, int ssid) { struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_local_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_local_stat_show(seq, css); css_put(css); return ret; } #endif static int cpu_stat_show(struct seq_file *seq, void *v) { int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); #endif return ret; } static int cpu_local_stat_show(struct seq_file *seq, void *v) { struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; #ifdef CONFIG_CGROUP_SCHED ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; cgroup_get(cgrp); cgroup_kn_unlock(of->kn); /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { cgroup_put(cgrp); return -EBUSY; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); } smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; } static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_CPU); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IRQ); } static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IRQ); } #endif static int cgroup_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); seq_printf(seq, "%d\n", psi->enabled); return 0; } static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret; int enable; struct cgroup *cgrp; struct psi_group *psi; ret = kstrtoint(strstrip(buf), 0, &enable); if (ret) return ret; if (enable < 0 || enable > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; psi = cgroup_psi(cgrp); if (psi->enabled != enable) { int i; /* show or hide {cpu,memory,io,irq}.pressure files */ for (i = 0; i < NR_PSI_RESOURCES; i++) cgroup_file_show(&cgrp->psi_files[i], enable); psi->enabled = enable; if (enable) psi_cgroup_restart(psi); } cgroup_kn_unlock(of->kn); return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { struct cgroup_file_ctx *ctx = of->priv; return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) { if (static_branch_likely(&psi_disabled)) return false; return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } #else /* CONFIG_PSI */ bool cgroup_psi_enabled(void) { return false; } #endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "%d\n", cgrp->freezer.freeze); return 0; } static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int freeze; ret = kstrtoint(strstrip(buf), 0, &freeze); if (ret) return ret; if (freeze < 0 || freeze > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgroup_freeze(cgrp, freeze); cgroup_kn_unlock(of->kn); return nbytes; } static void __cgroup_kill(struct cgroup *cgrp) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); while ((task = css_task_iter_next(&it))) { /* Ignore kernel threads here. */ if (task->flags & PF_KTHREAD) continue; /* Skip tasks that are already dying. */ if (__fatal_signal_pending(task)) continue; send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); } static void cgroup_kill(struct cgroup *cgrp) { struct cgroup_subsys_state *css; struct cgroup *dsct; lockdep_assert_held(&cgroup_mutex); cgroup_for_each_live_descendant_pre(dsct, css, cgrp) __cgroup_kill(dsct); } static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret = 0; int kill; struct cgroup *cgrp; ret = kstrtoint(strstrip(buf), 0, &kill); if (ret) return ret; if (kill != 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; /* * Killing is a process directed operation, i.e. the whole thread-group * is taken down so act like we do for cgroup.procs and only make this * writable in non-threaded cgroups. */ if (cgroup_is_threaded(cgrp)) ret = -EOPNOTSUPP; else cgroup_kill(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); if (ret) { put_cgroup_ns(ctx->ns); kfree(ctx); } return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; if (!nbytes) return 0; /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) return cft->write(of, buf, nbytes, off); /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and * doesn't need to be pinned. The RCU locking is not necessary * either. It's just for the convenience of using cgroup_css(). */ rcu_read_lock(); css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) ret = cft->write_u64(css, cft, v); } else if (cft->write_s64) { long long v; ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); } else { ret = -EINVAL; } return ret ?: nbytes; } static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); return kernfs_generic_poll(of, pt); } static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { if (seq_cft(seq)->seq_stop) seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); if (cft->seq_show) return cft->seq_show(m, arg); if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); else if (cft->read_s64) seq_printf(m, "%lld\n", cft->read_s64(css, cft)); else return -EINVAL; return 0; } static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, .seq_show = cgroup_seqfile_show, }; static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, notify_timer)); } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), current_fsuid(), current_fsgid(), 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); } return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory * @css: the target css * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails. */ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add) { struct cftype *cft, *cft_end = NULL; int ret = 0; lockdep_assert_held(&cgroup_mutex); restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); cft_end = cft; is_add = false; goto restart; } } else { cgroup_rm_file(cgrp, cft); } } return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; int ret = 0; lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } if (is_add && !ret) kernfs_activate(root->kn); return ret; } static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft->name[0] != '\0'; cft++) { /* free copy for custom atomic_write_len, see init_cftypes() */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); if (cft->flags & __CFTYPE_ADDED) { ret = -EBUSY; break; } if (cft->seq_start) kf_ops = &cgroup_kf_ops; else kf_ops = &cgroup_kf_single_ops; /* * Ugh... if @cft wants a custom max_write_len, we need to * make a copy of kf_ops to set its atomic_write_len. */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { ret = -ENOMEM; break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; cft->flags |= __CFTYPE_ADDED; } if (ret) cgroup_exit_cftypes(cfts); return ret; } static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); } /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes * * Unregister @cfts. Files described by @cfts are removed from all * existing cgroups and all future cgroups won't have them either. This * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered. */ int cgroup_rm_cftypes(struct cftype *cfts) { if (!cfts || cfts[0].name[0] == '\0') return 0; if (!(cfts[0].flags & __CFTYPE_ADDED)) return -ENOENT; cgroup_lock(); cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return 0; } /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Register @cfts to @ss. Files described by @cfts are created for all * existing cgroups to which @ss is attached and all future cgroups will * have them too. This function can be called anytime whether @ss is * attached or not. * * Returns 0 on successful registration, -errno on failure. Note that this * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') return 0; ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; cgroup_lock(); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return ret; } /** * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the default hierarchy. */ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_ONLY_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the legacy hierarchies. */ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_file_notify - generate a file modified event for a cgroup_file * @cfile: target cgroup_file * * @cfile must have been obtained by setting cftype->file_offset. */ void cgroup_file_notify(struct cgroup_file *cfile) { unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); if (cfile->kn) { unsigned long last = cfile->notified_at; unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; if (time_in_range(jiffies, last, next)) { timer_reduce(&cfile->notify_timer, next); } else { kernfs_notify(cfile->kn); cfile->notified_at = jiffies; } } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset * @show: whether to show or hide */ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; spin_lock_irq(&cgroup_file_kn_lock); kn = cfile->kn; kernfs_get(kn); spin_unlock_irq(&cgroup_file_kn_lock); if (kn) kernfs_show(kn, show); kernfs_put(kn); } /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk * * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is * that @parent and @pos are accessible. The next sibling is guaranteed to * be returned regardless of their states. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been unlinked from the sibling list. * Once a cgroup is removed, its ->sibling.next is no longer * updated when its next sibling changes. CSS_RELEASED is set when * @pos is taken off list, at which time its next pointer is valid, * and, as releases are serialized, the one pointed to by the next * pointer is guaranteed to not have started release yet. This * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we * have dropped rcu_read_lock() in-between iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically * increasing unique serial number and always appended to the * sibling list, the next one can be found by walking the parent's * children until the first css with higher serial number than * @pos's. While this path can be slower, it happens iff iteration * races against release and the race window is very small. */ if (!pos) { next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { list_for_each_entry_rcu(next, &parent->children, sibling, lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is * the next sibling. */ if (&next->sibling != &parent->children) return next; return NULL; } /** * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct next descendant as long as both @pos * and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { next = css_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; } return NULL; } EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css * @pos: css of interest * * Return the rightmost descendant of @pos. If there's no descendant, @pos * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct rightmost descendant as long as @pos * is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; cgroup_assert_mutex_or_rcu_locked(); do { last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last; do { last = pos; pos = css_next_child(NULL, pos); } while (pos); return last; } /** * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct next descendant as long as both @pos * and @cgroup are accessible and @pos is a descendant of @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ return pos->parent; } /** * css_has_online_children - does a css have online children * @css: the target css * * Returns %true if @css has any online children; otherwise, %false. This * function can be called from any context but the caller is responsible * for synchronizing against on/offlining as necessary. */ bool css_has_online_children(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *child; bool ret = false; rcu_read_lock(); css_for_each_child(child, css) { if (child->flags & CSS_ONLINE) { ret = true; break; } } rcu_read_unlock(); return ret; } static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) { struct list_head *l; struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* find the next threaded cset */ if (it->tcset_pos) { l = it->tcset_pos->next; if (l != it->tcset_head) { it->tcset_pos = l; return container_of(l, struct css_set, threaded_csets_node); } it->tcset_pos = NULL; } /* find the next cset */ l = it->cset_pos; l = l->next; if (l == it->cset_head) { it->cset_pos = NULL; return NULL; } if (it->ss) { cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); } else { link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } it->cset_pos = l; /* initialize threaded css_set walking */ if (it->flags & CSS_TASK_ITER_THREADED) { if (it->cur_dcset) put_css_set_locked(it->cur_dcset); it->cur_dcset = cset; get_css_set(cset); it->tcset_head = &cset->threaded_csets; it->tcset_pos = &cset->threaded_csets; } return cset; } /** * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set and find first non-empty tasks list*/ while ((cset = css_task_iter_next_css_set(it))) { if (!list_empty(&cset->tasks)) { it->cur_tasks_head = &cset->tasks; break; } else if (!list_empty(&cset->mg_tasks)) { it->cur_tasks_head = &cset->mg_tasks; break; } else if (!list_empty(&cset->dying_tasks)) { it->cur_tasks_head = &cset->dying_tasks; break; } } if (!cset) { it->task_pos = NULL; return; } it->task_pos = it->cur_tasks_head->next; /* * We don't keep css_sets locked across iteration steps and thus * need to take steps to ensure that iteration can be resumed after * the lock is re-acquired. Iteration is performed at two levels - * css_sets and tasks in them. * * Once created, a css_set never leaves its cgroup lists, so a * pinned css_set is guaranteed to stay put and we can resume * iteration afterwards. * * Tasks may leave @cset across iteration steps. This is resolved * by registering each iterator with the css_set currently being * walked and making css_set_move_task() advance iterators whose * next task is leaving. */ if (it->cur_cset) { list_del(&it->iters_node); put_css_set_locked(it->cur_cset); } get_css_set(cset); it->cur_cset = cset; list_add(&it->iters_node, &cset->task_iters); } static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task) { lockdep_assert_held(&css_set_lock); if (it->task_pos == &task->cg_list) { it->task_pos = it->task_pos->next; it->flags |= CSS_TASK_ITER_SKIPPED; } } static void css_task_iter_advance(struct css_task_iter *it) { struct task_struct *task; lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { /* * Advance iterator to find next entry. We go through cset * tasks, mg_tasks and dying_tasks, when consumed we move onto * the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; if (it->task_pos == &it->cur_cset->tasks) { it->cur_tasks_head = &it->cur_cset->mg_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->mg_tasks) { it->cur_tasks_head = &it->cur_cset->dying_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->dying_tasks) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } if (!it->task_pos) return; task = list_entry(it->task_pos, struct task_struct, cg_list); if (it->flags & CSS_TASK_ITER_PROCS) { /* if PROCS, skip over tasks which aren't group leaders */ if (!thread_group_leader(task)) goto repeat; /* and dying leaders w/o live member threads */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat; } } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { unsigned long irqflags; memset(it, 0, sizeof(*it)); spin_lock_irqsave(&css_set_lock, irqflags); it->ss = css->ss; it->flags = flags; if (CGROUP_HAS_SUBSYS_CONFIG && it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; it->cset_head = it->cset_pos; css_task_iter_advance(it); spin_unlock_irqrestore(&css_set_lock, irqflags); } /** * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via css_task_iter_start(). Returns NULL when the iteration * reaches the end. */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { unsigned long irqflags; if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } spin_lock_irqsave(&css_set_lock, irqflags); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) css_task_iter_advance(it); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); get_task_struct(it->cur_task); css_task_iter_advance(it); } spin_unlock_irqrestore(&css_set_lock, irqflags); return it->cur_task; } /** * css_task_iter_end - finish task iteration * @it: the task iterator to finish * * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) { unsigned long irqflags; if (it->cur_cset) { spin_lock_irqsave(&css_set_lock, irqflags); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); spin_unlock_irqrestore(&css_set_lock, irqflags); } if (it->cur_dcset) put_css_set(it->cur_dcset); if (it->cur_task) put_task_struct(it->cur_task); } static void cgroup_procs_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; if (ctx->procs.started) css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_file_ctx *ctx = of->priv; struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); css_task_iter_start(&cgrp->self, iter_flags, it); ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); } else return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) { struct cgroup *cgrp = seq_css(s)->cgroup; /* * All processes of a threaded subtree belong to the domain cgroup * of the subtree. Only threads can be distributed across the * subtree. Reject reads on cgroup.procs in the subtree proper. * They're always empty anyway. */ if (cgroup_is_threaded(cgrp)) return ERR_PTR(-EOPNOTSUPP); return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED); } static int cgroup_procs_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", task_pid_vnr(v)); return 0; } static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) { int ret; struct inode *inode; lockdep_assert_held(&cgroup_mutex); inode = kernfs_get_inode(sb, cgrp->procs_file.kn); if (!inode) return -ENOMEM; ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, struct cgroup_namespace *ns) { struct cgroup *com_cgrp = src_cgrp; int ret; lockdep_assert_held(&cgroup_mutex); /* find the common ancestor */ while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret; /* * If namespaces are delegation boundaries, %current must be able * to see both source and destination cgroups from its namespace. */ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) return -ENOENT; return 0; } static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, bool threadgroup, struct cgroup_namespace *ns) { int ret = 0; ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; ret = cgroup_migrate_vet_dst(dst_cgrp); if (ret) return ret; if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) ret = -EOPNOTSUPP; return ret; } static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; const struct cred *saved_cred; ssize_t ret; bool threadgroup_locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* find the source cgroup */ spin_lock_irq(&css_set_lock); src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); /* * Process and thread migrations follow same delegation rule. Check * permissions using the credentials from file open to protect against * inherited fd attacks. */ saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup, ctx->ns); revert_creds(saved_cred); if (ret) goto out_finish; ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, threadgroup_locked); out_unlock: cgroup_kn_unlock(of->kn); return ret; } static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, true) ?: nbytes; } static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) { return __cgroup_procs_start(s, pos, 0); } static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, false) ?: nbytes; } /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { { .name = "cgroup.type", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_type_show, .write = cgroup_type_write, }, { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, { .name = "cgroup.threads", .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_threads_write, }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, { .name = "cgroup.max.descendants", .seq_show = cgroup_max_descendants_show, .write = cgroup_max_descendants_write, }, { .name = "cgroup.max.depth", .seq_show = cgroup_max_depth_show, .write = cgroup_max_depth_write, }, { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, .write = cgroup_freeze_write, }, { .name = "cgroup.kill", .flags = CFTYPE_NOT_ON_ROOT, .write = cgroup_kill_write, }, { .name = "cpu.stat", .seq_show = cpu_stat_show, }, { .name = "cpu.stat.local", .seq_show = cpu_local_stat_show, }, { } /* terminate */ }; static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #ifdef CONFIG_IRQ_TIME_ACCOUNTING { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #endif { .name = "cgroup.pressure", .seq_show = cgroup_pressure_show, .write = cgroup_pressure_write, }, #endif /* CONFIG_PSI */ { } /* terminate */ }; /* * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be * offlined by invoking offline_css(). After offlining, the base ref is * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional * steps to the already complex sequence. */ static void css_free_rwork_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); css_rstat_exit(css); if (!css_is_self(css)) { /* css free path */ struct cgroup_subsys_state *parent = css->parent; int id = css->id; ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); if (parent) css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); if (!cgroup_on_dfl(cgrp)) cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); kfree(cgrp); } else { /* * This is root cgroup's refcnt reaching zero, * which indicates that the root should be * released. */ cgroup_destroy_root(cgrp->root); } } } static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; cgroup_lock(); css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (!css_is_self(css)) { struct cgroup *parent_cgrp; css_rstat_flush(css); cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); cgrp->nr_dying_subsys[ss->id]--; /* * When a css is released and ready to be freed, its * nr_descendants must be zero. However, the corresponding * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem * is activated and deactivated multiple times with one or * more of its previous activation leaving behind dying csses. */ WARN_ON_ONCE(css->nr_descendants); parent_cgrp = cgroup_parent(cgrp); while (parent_cgrp) { parent_cgrp->nr_dying_subsys[ss->id]--; parent_cgrp = cgroup_parent(parent_cgrp); } } else { struct cgroup *tcgrp; /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); css_rstat_flush(&cgrp->self); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - * cgroupstats_build() and css_tryget_online_from_dir(). * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { lockdep_assert_held(&cgroup_mutex); cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); if (css->parent) { atomic_inc(&css->parent->online_cnt); while ((css = css->parent)) css->nr_descendants++; } } return ret; } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) return; if (ss->css_offline) ss->css_offline(css); css->flags &= ~CSS_ONLINE; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); css->cgroup->nr_dying_subsys[ss->id]++; /* * Parent css and cgroup cannot be freed until after the freeing * of child css, see css_free_rwork_fn(). */ while ((css = css->parent)) { css->nr_descendants--; css->cgroup->nr_dying_subsys[ss->id]++; } } /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css * * Create a new css associated with @cgrp - @ss pair. On success, the new * css is online and installed in @cgrp. This function doesn't create the * interface files. Returns 0 on success, -errno on failure. */ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); if (!css) css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css; init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) goto err_free_css; css->id = err; err = css_rstat_init(css); if (err) goto err_free_css; /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) goto err_list_del; return css; err_list_del: list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); } /* * The returned cgroup is fully initialized including its control mask, but * it doesn't have the control mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; int i, level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; /* create the directory */ kn = kernfs_create_dir_ns(parent->kn, name, mode, current_fsuid(), current_fsgid(), cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_cancel_ref; } cgrp->kn = kn; init_cgroup_housekeeping(cgrp); cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; /* * Now that init_cgroup_housekeeping() has been called and cgrp->self * is setup, it is safe to perform rstat initialization on it. */ ret = css_rstat_init(&cgrp->self); if (ret) goto out_kernfs_remove; ret = psi_cgroup_alloc(cgrp); if (ret) goto out_stat_exit; for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) cgrp->ancestors[tcgrp->level] = tcgrp; /* * New cgroup inherits effective freeze counter, and * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be * attached to the child cgroup, it will become frozen. * At this point the new cgroup is unpopulated, so we can * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); set_bit(CGRP_FROZEN, &cgrp->flags); } if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); cgrp->self.serial_nr = css_serial_nr_next++; ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier, CGROUP_LIFETIME_ONLINE, CGROUP_LIFETIME_OFFLINE, cgrp); ret = notifier_to_errno(ret); if (ret) goto out_psi_free; /* allocation complete, commit to creation */ spin_lock_irq(&css_set_lock); for (i = 0; i < level; i++) { tcgrp = cgrp->ancestors[i]; tcgrp->nr_descendants++; /* * If the new cgroup is frozen, all ancestor cgroups get a new * frozen descendant, but their state can't change because of * this. */ if (cgrp->freezer.e_freeze) tcgrp->freezer.nr_frozen_descendants++; } spin_unlock_irq(&css_set_lock); list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. */ if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); cgroup_propagate_control(cgrp); return cgrp; out_psi_free: psi_cgroup_free(cgrp); out_stat_exit: css_rstat_exit(&cgrp->self); out_kernfs_remove: kernfs_remove(cgrp->kn); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); return ERR_PTR(ret); } static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; int level = 0; lockdep_assert_held(&cgroup_mutex); for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; if (level >= cgroup->max_depth) goto fail; level++; } ret = true; fail: return ret; } int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(name, '\n')) return -EINVAL; parent = cgroup_kn_lock_live(parent_kn, false); if (!parent) return -ENODEV; if (!cgroup_check_hierarchy_limits(parent)) { ret = -EAGAIN; goto out_unlock; } cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } /* * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; ret = cgroup_apply_control_enable(cgrp); if (ret) goto out_destroy; TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; out_destroy: cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; } /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to * initiate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); cgroup_lock(); do { offline_css(css); css_put(css); /* @css can't go away while we're holding cgroup_mutex */ css = css->parent; } while (css && atomic_dec_and_test(&css->online_cnt)); cgroup_unlock(); } /* css kill confirmation processing requires process context, bounce */ static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } } /** * kill_css - destroy a css * @css: css to destroy * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked * asynchronously once css_tryget_online() is guaranteed to fail and when * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) return; /* * Call css_killed(), if defined, before setting the CSS_DYING flag */ if (css->ss->css_killed) css->ss->css_killed(css); css->flags |= CSS_DYING; /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). */ css_get(css); /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to * guarantee that css_tryget_online() won't succeed by the time * ->css_offline() is invoked. To satisfy all the requirements, * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of * css's. Set up so that the next stage will be kicked off once all * the percpu refcnts are confirmed to be killed. * * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the * rest of destruction. Once all cgroup references are gone, the * cgroup is RCU-freed. * * This function implements s1. After this step, @cgrp is gone as far as * the userland is concerned and a new cgroup with the same name may be * created. As cgroup doesn't care about the names internally, this * doesn't cause any problem. */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid, ret; lockdep_assert_held(&cgroup_mutex); /* * Only migration can raise populated from zero and we're already * holding cgroup_mutex. */ if (cgroup_is_populated(cgrp)) return -EBUSY; /* * Make sure there's no live children. We can't test emptiness of * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ if (css_has_online_children(&cgrp->self)) return -EBUSY; /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) link->cset->dead = true; spin_unlock_irq(&css_set_lock); /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* * If the dying cgroup is frozen, decrease frozen descendants * counters of ancestor cgroups. */ if (test_bit(CGRP_FROZEN, &cgrp->flags)) tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, CGROUP_LIFETIME_OFFLINE, cgrp); WARN_ON_ONCE(notifier_to_errno(ret)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); return 0; }; int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; ret = cgroup_destroy_locked(cgrp); if (!ret) TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; pr_debug("Initializing cgroup subsys %s\n", ss->name); cgroup_lock(); idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); /* * Root csses are never destroyed and we can't initialize * percpu_ref during early init. Disable refcnting. */ css->flags |= CSS_NO_REF; if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); BUG_ON(ss_rstat_init(ss)); BUG_ON(css_rstat_init(css)); } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); BUG_ON(online_css(css)); cgroup_unlock(); } /** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any * subsystems that request early init. */ int __init cgroup_init_early(void) { static struct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i; ctx.root = &cgrp_dfl_root; init_cgroup_root(&ctx); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); WARN(ss->early_init && ss->css_rstat_flush, "cgroup rstat cannot be used with early init subsystem\n"); ss->id = i; ss->name = cgroup_subsys_name[i]; if (!ss->legacy_name) ss->legacy_name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss, true); } return 0; } /** * cgroup_init - cgroup initialization * * Register cgroup filesystem and /proc file, and initialize * any subsystems that didn't request early init. */ int __init cgroup_init(void) { struct cgroup_subsys *ss; int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); cgroup_lock(); /* * Add init_css_set to the hash table so that dfl_root can link to * it during init. */ hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); cgroup_bpf_lifetime_notifier_init(); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); cgroup_unlock(); for_each_subsys(ss, ssid) { if (ss->early_init) { struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } else { cgroup_init_subsys(ss, false); } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); /* * Setting dfl_root subsys_mask needs to consider the * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ if (!cgroup_ssid_enabled(ssid)) continue; if (cgroup1_ssid_disabled(ssid)) pr_info("Disabling %s control group subsystem in v1 mounts\n", ss->legacy_name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; /* implicit controllers must be threaded too */ WARN_ON(ss->implicit_on_dfl && !ss->threaded); if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->threaded) cgrp_dfl_threaded_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } if (ss->bind) ss->bind(init_css_set.subsys[ssid]); cgroup_lock(); css_populate_dir(init_css_set.subsys[ssid]); cgroup_unlock(); } /* init_css_set.subsys[] has been updated, re-hash */ hash_del(&init_css_set.hlist); hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); #ifdef CONFIG_CPUSETS_V1 WARN_ON(register_filesystem(&cpuset_fs_type)); #endif return 0; } static int __init cgroup_wq_init(void) { /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); return 0; } core_initcall(cgroup_wq_init); void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); kernfs_put(kn); } /* * cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure * Only cgroups within current task's cgroup NS are valid. */ struct cgroup *cgroup_get_from_id(u64 id) { struct kernfs_node *kn; struct cgroup *cgrp, *root_cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return ERR_PTR(-ENOENT); if (kernfs_type(kn) != KERNFS_DIR) { kernfs_put(kn); return ERR_PTR(-ENOENT); } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp && !cgroup_tryget(cgrp)) cgrp = NULL; rcu_read_unlock(); kernfs_put(kn); if (!cgrp) return ERR_PTR(-ENOENT); root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { cgroup_put(cgrp); return ERR_PTR(-ENOENT); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. */ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { char *buf; int retval; struct cgroup_root *root; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; rcu_read_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; cgrp = task_cgroup_from_root(tsk, root); /* The root has already been unmounted. */ if (!cgrp) continue; seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->legacy_name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, * while a zombie doesn't show up in "cgroup.procs" and * thus can't be migrated, its /proc/PID/cgroup keeps * reporting the cgroup it belonged to before exiting. If * the cgroup is removed before the zombie is reaped, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; seq_puts(m, buf); } else { seq_puts(m, "/"); } if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else seq_putc(m, '\n'); } retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); rcu_read_unlock(); kfree(buf); out: return retval; } /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } /** * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer * @f: file corresponding to cgroup_dir * * Find the cgroup from a file pointer associated with a cgroup directory. * Returns a pointer to the cgroup on success. ERR_PTR is returned if the * cgroup cannot be found. */ static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); return css->cgroup; } /** * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports * cgroup2. * @f: file corresponding to cgroup2_dir */ static struct cgroup *cgroup_get_from_file(struct file *f) { struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } /** * cgroup_css_set_fork - find or create a css_set for a child process * @kargs: the arguments passed to create the child process * * This functions finds or creates a new css_set which the child * process will be attached to in cgroup_post_fork(). By default, * the child process will be given the same css_set as its parent. * * If CLONE_INTO_CGROUP is specified this function will try to find an * existing css_set which includes the requested cgroup and if not create * a new css_set that the child will be attached to later. If this function * succeeds it will hold cgroup_threadgroup_rwsem on return. If * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex * before grabbing cgroup_threadgroup_rwsem and will hold a reference * to the target cgroup. */ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) { int ret; struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; if (kargs->flags & CLONE_INTO_CGROUP) cgroup_lock(); cgroup_threadgroup_change_begin(current); spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); if (kargs->cgrp) kargs->kill_seq = kargs->cgrp->kill_seq; else kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { kargs->cset = cset; return 0; } CLASS(fd_raw, f)(kargs->cgroup); if (fd_empty(f)) { ret = -EBADF; goto err; } sb = fd_file(f)->f_path.dentry->d_sb; dst_cgrp = cgroup_get_from_file(fd_file(f)); if (IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; goto err; } if (cgroup_is_dead(dst_cgrp)) { ret = -ENODEV; goto err; } /* * Verify that we the target cgroup is writable for us. This is * usually done by the vfs layer but since we're not going through * the vfs layer here we need to do it "manually". */ ret = cgroup_may_write(dst_cgrp, sb); if (ret) goto err; /* * Spawning a task directly into a cgroup works by passing a file * descriptor to the target cgroup directory. This can even be an O_PATH * file descriptor. But it can never be a cgroup.procs file descriptor. * This was done on purpose so spawning into a cgroup could be * conceptualized as an atomic * * fd = openat(dfd_cgroup, "cgroup.procs", ...); * write(fd, <child-pid>, ...); * * sequence, i.e. it's a shorthand for the caller opening and writing * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us * to always use the caller's credentials. */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); if (ret) goto err; kargs->cset = find_css_set(cset, dst_cgrp); if (!kargs->cset) { ret = -ENOMEM; goto err; } put_css_set(cset); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); cgroup_unlock(); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); if (kargs->cset) put_css_set(kargs->cset); return ret; } /** * cgroup_css_set_put_fork - drop references we took during fork * @kargs: the arguments passed to create the child process * * Drop references to the prepared css_set and target cgroup if * CLONE_INTO_CGROUP was requested. */ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { struct cgroup *cgrp = kargs->cgrp; struct css_set *cset = kargs->cset; cgroup_threadgroup_change_end(current); if (cset) { put_css_set(cset); kargs->cset = NULL; } if (kargs->flags & CLONE_INTO_CGROUP) { cgroup_unlock(); if (cgrp) { cgroup_put(cgrp); kargs->cgrp = NULL; } } } /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() * callback returns an error, the fork aborts with that error code. This * allows for a cgroup subsystem to conditionally allow or deny new forks. */ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; ret = cgroup_css_set_fork(kargs); if (ret) return ret; do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); return 0; out_revert: for_each_subsys(ss, j) { if (j >= i) break; if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); } cgroup_css_set_put_fork(kargs); return ret; } /** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() * @child: the child process * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeeded and cleans up references we took to * prepare a new css_set for the child process in cgroup_can_fork(). */ void cgroup_cancel_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); cgroup_css_set_put_fork(kargs); } /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. */ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i; cset = kargs->cset; kargs->cset = NULL; spin_lock_irq(&css_set_lock); /* init tasks are special, only link regular threads */ if (likely(child->pid)) { if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; cgrp_kill_seq = kargs->cgrp->kill_seq; } else { cgrp_flags = cset->dfl_cgrp->flags; cgrp_kill_seq = cset->dfl_cgrp->kill_seq; } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } else { put_css_set(cset); cset = NULL; } if (!(child->flags & PF_KTHREAD)) { if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { /* * If the cgroup has to be frozen, the new task has * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to * get the task into the frozen state. */ spin_lock(&child->sighand->siglock); WARN_ON_ONCE(child->frozen); child->jobctl |= JOBCTL_TRAP_FREEZE; spin_unlock(&child->sighand->siglock); /* * Calling cgroup_update_frozen() isn't required here, * because it will be called anyway a bit later from * do_freezer_trap(). So we avoid cgroup's transient * switch from the frozen state and back. */ } /* * If the cgroup is to be killed notice it now and take the * child down right after we finished preparing it for * userspace. */ kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); } while_each_subsys_mask(); /* Make the new cset the root_cset of the new cgroup namespace. */ if (kargs->flags & CLONE_NEWCGROUP) { struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; get_css_set(cset); child->nsproxy->cgroup_ns->root_cset = cset; put_css_set(rcset); } /* Cgroup has to be killed so take down child immediately. */ if (unlikely(kill)) do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID); cgroup_css_set_put_fork(kargs); } /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; int i; spin_lock_irq(&css_set_lock); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; /* matches the signal->live check in css_task_iter_advance() */ if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); if (dl_task(tsk)) dec_dl_tasks_cs(tsk); WARN_ON_ONCE(cgroup_task_frozen(tsk)); if (unlikely(!(tsk->flags & PF_KTHREAD) && test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); } while_each_subsys_mask(); } void cgroup_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); if (!list_empty(&task->cg_list)) { spin_lock_irq(&css_set_lock); css_set_skip_task_iters(task_css_set(task), task); list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } } void cgroup_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); put_css_set(cset); } static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; static_branch_disable(cgroup_subsys_enabled_key[i]); pr_info("Disabling %s control group subsystem\n", ss->name); } for (i = 0; i < OPT_FEATURE_COUNT; i++) { if (strcmp(token, cgroup_opt_feature_names[i])) continue; cgroup_feature_disable_mask |= 1 << i; pr_info("Disabling %s control group feature\n", cgroup_opt_feature_names[i]); break; } } return 1; } __setup("cgroup_disable=", cgroup_disable); void __init __weak enable_debug_cgroup(void) { } static int __init enable_cgroup_debug(char *str) { cgroup_debug = true; enable_debug_cgroup(); return 1; } __setup("cgroup_debug", enable_cgroup_debug); static int __init cgroup_favordynmods_setup(char *str) { return (kstrtobool(str, &have_favordynmods) == 0); } __setup("cgroup_favordynmods=", cgroup_favordynmods_setup); /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * * If @dentry is a directory for a cgroup which has @ss enabled on it, try * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); return css; } /** * css_from_id - lookup css by id * @id: the cgroup id * @ss: cgroup subsys to be looked into * * Returns the css if there's valid one with @id, otherwise returns NULL. * Should be called under rcu_read_lock(). */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); return idr_find(&ss->css_idr, id); } /** * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path * @path: path on the default hierarchy * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup *root_cgrp; root_cgrp = current_cgns_cgroup_dfl(); kn = kernfs_walk_and_get(root_cgrp->kn, path); if (!kn) goto out; if (kernfs_type(kn) != KERNFS_DIR) { cgrp = ERR_PTR(-ENOTDIR); goto out_kernfs; } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) cgrp = ERR_PTR(-ENOENT); rcu_read_unlock(); out_kernfs: kernfs_put(kn); out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ struct cgroup *cgroup_v1v2_get_from_fd(int fd) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return ERR_PTR(-EBADF); return cgroup_v1v2_get_from_file(fd_file(f)); } /** * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports * cgroup2. * @fd: fd obtained by open(cgroup2_dir) */ struct cgroup *cgroup_get_from_fd(int fd) { struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); static u64 power_of_ten(int power) { u64 v = 1; while (power--) v *= 10; return v; } /** * cgroup_parse_float - parse a floating number * @input: input string * @dec_shift: number of decimal digits to shift * @v: output * * Parse a decimal floating point number in @input and store the result in * @v with decimal point right shifted @dec_shift times. For example, if * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. * Returns 0 on success, -errno otherwise. * * There's nothing cgroup specific about this function except that it's * currently the only user. */ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) { s64 whole, frac = 0; int fstart = 0, fend = 0, flen; if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) return -EINVAL; if (frac < 0) return -EINVAL; flen = fend > fstart ? fend - fstart : 0; if (flen < dec_shift) frac *= power_of_ten(dec_shift - flen); else frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); *v = whole * power_of_ten(dec_shift) + frac; return 0; } /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { struct cgroup *cgroup; rcu_read_lock(); /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) { cgroup = &cgrp_dfl_root.cgrp; cgroup_get(cgroup); goto out; } while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { cgroup = cset->dfl_cgrp; break; } cpu_relax(); } out: skcd->cgroup = cgroup; cgroup_bpf_get(cgroup); rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); /* * We might be cloning a socket which is left in an empty * cgroup and the cgroup might have already been rmdir'd. * Don't use cgroup_get_live(). */ cgroup_get(cgrp); cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); cgroup_bpf_put(cgrp); cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) { struct cftype *cft; ssize_t ret = 0; for (cft = files; cft && cft->name[0] != '\0'; cft++) { if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); if (WARN_ON(ret >= size)) break; } return ret; } static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct cgroup_subsys *ss; int ssid; ssize_t ret = 0; ret = show_delegatable_files(cgroup_base_files, buf + ret, PAGE_SIZE - ret, NULL); if (cgroup_psi_enabled()) ret += show_delegatable_files(cgroup_psi_files, buf + ret, PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, PAGE_SIZE - ret, cgroup_subsys_name[ssid]); return ret; } static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n" "memory_hugetlb_accounting\n" "pids_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); static struct attribute *cgroup_sysfs_attrs[] = { &cgroup_delegate_attr.attr, &cgroup_features_attr.attr, NULL, }; static const struct attribute_group cgroup_sysfs_attr_group = { .attrs = cgroup_sysfs_attrs, .name = "cgroup", }; static int __init cgroup_sysfs_init(void) { return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); } subsys_initcall(cgroup_sysfs_init); #endif /* CONFIG_SYSFS */
5 3 22 76 38 64 61 3 85 85 25 1 22 21 60 2 58 1 57 1 56 57 4 28 32 32 58 58 2 2 1 2 1 14 6 1 7 6 1 4 1 2 3 3 2 1 7 3 4 1 1 1 1 1 1 1 1 1 1 4 1 3 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/namei.h> #include <linux/pipe_fs_i.h> #include <linux/watch_queue.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "rsrc.h" #include "openclose.h" struct io_open { struct file *file; int dfd; u32 file_slot; struct filename *filename; struct open_how how; unsigned long nofile; }; struct io_close { struct file *file; int fd; u32 file_slot; }; struct io_fixed_install { struct file *file; unsigned int o_flags; }; static bool io_openat_force_async(struct io_open *open) { /* * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, * it'll always -EAGAIN. Note that we test for __O_TMPFILE because * O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force * async for. */ return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE); } static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); const char __user *fname; int ret; if (unlikely(sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; /* open.how should be already initialised */ if (!(open->how.flags & O_PATH) && force_o_largefile()) open->how.flags |= O_LARGEFILE; open->dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); open->filename = getname(fname); if (IS_ERR(open->filename)) { ret = PTR_ERR(open->filename); open->filename = NULL; return ret; } open->file_slot = READ_ONCE(sqe->file_index); if (open->file_slot && (open->how.flags & O_CLOEXEC)) return -EINVAL; open->nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; if (io_openat_force_async(open)) req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); u64 mode = READ_ONCE(sqe->len); u64 flags = READ_ONCE(sqe->open_flags); open->how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); struct open_how __user *how; size_t len; int ret; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) return -EINVAL; ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); if (ret) return ret; return __io_openat_prep(req, sqe); } int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); struct open_flags op; struct file *file; bool resolve_nonblock, nonblock_set; bool fixed = !!open->file_slot; int ret; ret = build_open_flags(&open->how, &op); if (ret) goto err; nonblock_set = op.open_flag & O_NONBLOCK; resolve_nonblock = open->how.resolve & RESOLVE_CACHED; if (issue_flags & IO_URING_F_NONBLOCK) { WARN_ON_ONCE(io_openat_force_async(open)); op.lookup_flags |= LOOKUP_CACHED; op.open_flag |= O_NONBLOCK; } if (!fixed) { ret = __get_unused_fd_flags(open->how.flags, open->nofile); if (ret < 0) goto err; } file = do_filp_open(open->dfd, open->filename, &op); if (IS_ERR(file)) { /* * We could hang on to this 'fd' on retrying, but seems like * marginal gain for something that is now known to be a slower * path. So just put it, and we'll get a new one when we retry. */ if (!fixed) put_unused_fd(ret); ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ if (ret == -EAGAIN && (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) return -EAGAIN; goto err; } if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; if (!fixed) fd_install(ret, file); else ret = io_fixed_fd_install(req, issue_flags, file, open->file_slot); err: putname(open->filename); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_COMPLETE; } int io_openat(struct io_kiocb *req, unsigned int issue_flags) { return io_openat2(req, issue_flags); } void io_open_cleanup(struct io_kiocb *req) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); if (open->filename) putname(open->filename); } int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset) { int ret; io_ring_submit_lock(ctx, issue_flags); ret = io_fixed_fd_remove(ctx, offset); io_ring_submit_unlock(ctx, issue_flags); return ret; } static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { struct io_close *close = io_kiocb_to_cmd(req, struct io_close); return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1); } int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_close *close = io_kiocb_to_cmd(req, struct io_close); if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; close->fd = READ_ONCE(sqe->fd); close->file_slot = READ_ONCE(sqe->file_index); if (close->file_slot && close->fd) return -EINVAL; return 0; } int io_close(struct io_kiocb *req, unsigned int issue_flags) { struct files_struct *files = current->files; struct io_close *close = io_kiocb_to_cmd(req, struct io_close); struct file *file; int ret = -EBADF; if (close->file_slot) { ret = io_close_fixed(req, issue_flags); goto err; } spin_lock(&files->file_lock); file = files_lookup_fd_locked(files, close->fd); if (!file || io_is_uring_fops(file)) { spin_unlock(&files->file_lock); goto err; } /* if the file has a flush method, be safe and punt to async */ if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { spin_unlock(&files->file_lock); return -EAGAIN; } file = file_close_fd_locked(files, close->fd); spin_unlock(&files->file_lock); if (!file) goto err; /* No ->flush() or already async, safely close from here */ ret = filp_close(file, current->files); err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_COMPLETE; } int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fixed_install *ifi; unsigned int flags; if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || sqe->splice_fd_in || sqe->addr3) return -EINVAL; /* must be a fixed file */ if (!(req->flags & REQ_F_FIXED_FILE)) return -EBADF; flags = READ_ONCE(sqe->install_fd_flags); if (flags & ~IORING_FIXED_FD_NO_CLOEXEC) return -EINVAL; /* ensure the task's creds are used when installing/receiving fds */ if (req->flags & REQ_F_CREDS) return -EPERM; /* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */ ifi = io_kiocb_to_cmd(req, struct io_fixed_install); ifi->o_flags = O_CLOEXEC; if (flags & IORING_FIXED_FD_NO_CLOEXEC) ifi->o_flags = 0; return 0; } int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_fixed_install *ifi; int ret; ifi = io_kiocb_to_cmd(req, struct io_fixed_install); ret = receive_fd(req->file, NULL, ifi->o_flags); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_COMPLETE; } struct io_pipe { struct file *file; int __user *fds; int flags; int file_slot; unsigned long nofile; }; int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); if (sqe->fd || sqe->off || sqe->addr3) return -EINVAL; p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr)); p->flags = READ_ONCE(sqe->pipe_flags); if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) return -EINVAL; p->file_slot = READ_ONCE(sqe->file_index); p->nofile = rlimit(RLIMIT_NOFILE); return 0; } static int io_pipe_fixed(struct io_kiocb *req, struct file **files, unsigned int issue_flags) { struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); struct io_ring_ctx *ctx = req->ctx; int ret, fds[2] = { -1, -1 }; int slot = p->file_slot; if (p->flags & O_CLOEXEC) return -EINVAL; io_ring_submit_lock(ctx, issue_flags); ret = __io_fixed_fd_install(ctx, files[0], slot); if (ret < 0) goto err; fds[0] = ret; files[0] = NULL; /* * If a specific slot is given, next one will be used for * the write side. */ if (slot != IORING_FILE_INDEX_ALLOC) slot++; ret = __io_fixed_fd_install(ctx, files[1], slot); if (ret < 0) goto err; fds[1] = ret; files[1] = NULL; io_ring_submit_unlock(ctx, issue_flags); if (!copy_to_user(p->fds, fds, sizeof(fds))) return 0; ret = -EFAULT; io_ring_submit_lock(ctx, issue_flags); err: if (fds[0] != -1) io_fixed_fd_remove(ctx, fds[0]); if (fds[1] != -1) io_fixed_fd_remove(ctx, fds[1]); io_ring_submit_unlock(ctx, issue_flags); return ret; } static int io_pipe_fd(struct io_kiocb *req, struct file **files) { struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); int ret, fds[2] = { -1, -1 }; ret = __get_unused_fd_flags(p->flags, p->nofile); if (ret < 0) goto err; fds[0] = ret; ret = __get_unused_fd_flags(p->flags, p->nofile); if (ret < 0) goto err; fds[1] = ret; if (!copy_to_user(p->fds, fds, sizeof(fds))) { fd_install(fds[0], files[0]); fd_install(fds[1], files[1]); return 0; } ret = -EFAULT; err: if (fds[0] != -1) put_unused_fd(fds[0]); if (fds[1] != -1) put_unused_fd(fds[1]); return ret; } int io_pipe(struct io_kiocb *req, unsigned int issue_flags) { struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); struct file *files[2]; int ret; ret = create_pipe_files(files, p->flags); if (ret) return ret; if (!!p->file_slot) ret = io_pipe_fixed(req, files, issue_flags); else ret = io_pipe_fd(req, files); io_req_set_res(req, ret, 0); if (!ret) return IOU_COMPLETE; req_set_fail(req); if (files[0]) fput(files[0]); if (files[1]) fput(files[1]); return ret; }
2 2 4 12 4 4 4 4 4 1 4 2 2 2 5 5 3 5 5 5 5 5 4 1 4 4 4 3 5 1 3 1 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" static inline bool devlink_rate_is_leaf(struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; } static inline bool devlink_rate_is_node(struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; } static struct devlink_rate * devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) { struct devlink_rate *devlink_rate; struct devlink_port *devlink_port; devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); if (IS_ERR(devlink_port)) return ERR_CAST(devlink_port); devlink_rate = devlink_port->devlink_rate; return devlink_rate ?: ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { static struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && !strcmp(node_name, devlink_rate->name)) return devlink_rate; } return ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { const char *rate_node_name; size_t len; if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return ERR_PTR(-EINVAL); rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); len = strlen(rate_node_name); /* Name cannot be empty or decimal number */ if (!len || strspn(rate_node_name, "0123456789") == len) return ERR_PTR(-EINVAL); return devlink_rate_node_get_by_name(devlink, rate_node_name); } static struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } static struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; if (attrs[DEVLINK_ATTR_PORT_INDEX]) return devlink_rate_leaf_get_from_info(devlink, info); else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return devlink_rate_node_get_from_info(devlink, info); else return ERR_PTR(-EINVAL); } static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) { struct nlattr *nla_tc_bw; int i; for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS); if (!nla_tc_bw) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, i) || nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[i])) goto nla_put_failure; nla_nest_end(msg, nla_tc_bw); } return 0; nla_put_failure: nla_nest_cancel(msg, nla_tc_bw); return -EMSGSIZE; } static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_rate->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) goto nla_put_failure; if (devlink_rate_is_leaf(devlink_rate)) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_rate->devlink_port->index)) goto nla_put_failure; } else if (devlink_rate_is_node(devlink_rate)) { if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, devlink_rate->name)) goto nla_put_failure; } if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_SHARE, devlink_rate->tx_share)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_MAX, devlink_rate->tx_max)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, devlink_rate->tx_priority)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, devlink_rate->tx_weight)) goto nla_put_failure; if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) goto nla_put_failure; if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { struct devlink *devlink = devlink_rate->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_rates_notify_register(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); } void devlink_rates_notify_unregister(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); } static int devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; int idx = 0; int err = 0; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; u32 id = NETLINK_CB(cb->skb).portid; if (idx < state->idx) { idx++; continue; } err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, cb->nlh->nlmsg_seq, flags, NULL); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one); } int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; struct sk_buff *msg; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static bool devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, struct devlink_rate *parent) { while (parent) { if (parent == devlink_rate) return true; parent = parent->parent; } return false; } static int devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, struct genl_info *info, struct nlattr *nla_parent) { struct devlink *devlink = devlink_rate->devlink; const char *parent_name = nla_data(nla_parent); const struct devlink_ops *ops = devlink->ops; size_t len = strlen(parent_name); struct devlink_rate *parent; int err = -EOPNOTSUPP; parent = devlink_rate->parent; if (parent && !len) { if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); if (err) return err; refcount_dec(&parent->refcnt); devlink_rate->parent = NULL; } else if (len) { parent = devlink_rate_node_get_by_name(devlink, parent_name); if (IS_ERR(parent)) return -ENODEV; if (parent == devlink_rate) { NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed"); return -EINVAL; } if (devlink_rate_is_node(devlink_rate) && devlink_rate_is_parent_node(devlink_rate, parent->parent)) { NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node."); return -EEXIST; } if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); if (err) return err; if (devlink_rate->parent) /* we're reassigning to other parent in this case */ refcount_dec(&devlink_rate->parent->refcnt); refcount_inc(&parent->refcnt); devlink_rate->parent = parent; } return 0; } static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, unsigned long *bitmap, struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1]; u8 tc_index; int err; err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest, devlink_dl_rate_tc_bws_nl_policy, extack); if (err) return err; if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) { NL_SET_ERR_ATTR_MISS(extack, parent_nest, DEVLINK_RATE_TC_ATTR_INDEX); return -EINVAL; } tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]); if (!tb[DEVLINK_RATE_TC_ATTR_BW]) { NL_SET_ERR_ATTR_MISS(extack, parent_nest, DEVLINK_RATE_TC_ATTR_BW); return -EINVAL; } if (test_and_set_bit(tc_index, bitmap)) { NL_SET_ERR_MSG_FMT(extack, "Duplicate traffic class index specified (%u)", tc_index); return -EINVAL; } tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]); return 0; } static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate, struct genl_info *info) { DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {}; struct devlink *devlink = devlink_rate->devlink; const struct devlink_ops *ops = devlink->ops; u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {}; int rem, err = -EOPNOTSUPP, i; struct nlattr *attr; nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr, GENL_HDRLEN, rem) { err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap, info->extack); if (err) return err; } for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { if (!test_bit(i, bitmap)) { NL_SET_ERR_MSG_FMT(info->extack, "Bandwidth values must be specified for all %u traffic classes", DEVLINK_RATE_TCS_MAX); return -EINVAL; } } if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, tc_bw, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, tc_bw, info->extack); if (err) return err; memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); return 0; } static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) { struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u32 priority; u32 weight; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_share = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_max = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); if (err) return err; devlink_rate->tx_priority = priority; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); if (err) return err; devlink_rate->tx_weight = weight; } nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, nla_parent); if (err) return err; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) { err = devlink_nl_rate_tc_bw_set(devlink_rate, info); if (err) return err; } return 0; } static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, struct genl_info *info, enum devlink_rate_type type) { struct nlattr **attrs = info->attrs; if (type == DEVLINK_RATE_TYPE_LEAF) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_leaf_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_leaf_tc_bw_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TC_BWS], "TC bandwidth set isn't supported for the leafs"); return false; } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_node_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_node_tc_bw_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TC_BWS], "TC bandwidth set isn't supported for the nodes"); return false; } } else { WARN(1, "Unknown type of rate object"); return false; } return true; } int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; const struct devlink_ops *ops; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); ops = devlink->ops; if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) return -EOPNOTSUPP; err = devlink_nl_rate_set(devlink_rate, ops, info); if (!err) devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return err; } int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; const struct devlink_ops *ops; int err; ops = devlink->ops; if (!ops || !ops->rate_node_new || !ops->rate_node_del) { NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported"); return -EOPNOTSUPP; } if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) return -EOPNOTSUPP; rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); if (!IS_ERR(rate_node)) return -EEXIST; else if (rate_node == ERR_PTR(-EINVAL)) return -EINVAL; rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return -ENOMEM; rate_node->devlink = devlink; rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); if (!rate_node->name) { err = -ENOMEM; goto err_strdup; } err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); if (err) goto err_node_new; err = devlink_nl_rate_set(rate_node, ops, info); if (err) goto err_rate_set; refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return 0; err_rate_set: ops->rate_node_del(rate_node, rate_node->priv, info->extack); err_node_new: kfree(rate_node->name); err_strdup: kfree(rate_node); return err; } int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; int err; rate_node = devlink_rate_node_get_from_info(devlink, info); if (IS_ERR(rate_node)) return PTR_ERR(rate_node); if (refcount_read(&rate_node->refcnt) > 1) { NL_SET_ERR_MSG(info->extack, "Node has children. Cannot delete node."); return -EBUSY; } devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); err = devlink->ops->rate_node_del(rate_node, rate_node->priv, info->extack); if (rate_node->parent) refcount_dec(&rate_node->parent->refcnt); list_del(&rate_node->list); kfree(rate_node->name); kfree(rate_node); return err; } int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { NL_SET_ERR_MSG(extack, "Rate node(s) exists."); return -EBUSY; } return 0; } /** * devl_rate_node_create - create devlink rate node * @devlink: devlink instance * @priv: driver private data * @node_name: name of the resulting node * @parent: parent devlink_rate struct * * Create devlink rate object of type node */ struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent) { struct devlink_rate *rate_node; rate_node = devlink_rate_node_get_by_name(devlink, node_name); if (!IS_ERR(rate_node)) return ERR_PTR(-EEXIST); rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return ERR_PTR(-ENOMEM); if (parent) { rate_node->parent = parent; refcount_inc(&rate_node->parent->refcnt); } rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->devlink = devlink; rate_node->priv = priv; rate_node->name = kstrdup(node_name, GFP_KERNEL); if (!rate_node->name) { kfree(rate_node); return ERR_PTR(-ENOMEM); } refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return rate_node; } EXPORT_SYMBOL_GPL(devl_rate_node_create); /** * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data * @parent: parent devlink_rate struct * * Create devlink rate object of type leaf on provided @devlink_port. */ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, struct devlink_rate *parent) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; devl_assert_locked(devlink_port->devlink); if (WARN_ON(devlink_port->devlink_rate)) return -EBUSY; devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); if (!devlink_rate) return -ENOMEM; if (parent) { devlink_rate->parent = parent; refcount_inc(&devlink_rate->parent->refcnt); } devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; devlink_rate->priv = priv; list_add_tail(&devlink_rate->list, &devlink->rate_list); devlink_port->devlink_rate = devlink_rate; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_rate_leaf_create); /** * devl_rate_leaf_destroy - destroy devlink rate leaf * * @devlink_port: devlink port linked to the rate object * * Destroy the devlink rate object of type leaf on provided @devlink_port. */ void devl_rate_leaf_destroy(struct devlink_port *devlink_port) { struct devlink_rate *devlink_rate = devlink_port->devlink_rate; devl_assert_locked(devlink_port->devlink); if (!devlink_rate) return; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); if (devlink_rate->parent) refcount_dec(&devlink_rate->parent->refcnt); list_del(&devlink_rate->list); devlink_port->devlink_rate = NULL; kfree(devlink_rate); } EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); /** * devl_rate_nodes_destroy - destroy all devlink rate nodes on device * @devlink: devlink instance * * Unset parent for all rate objects and destroy all rate nodes * on specified device. */ void devl_rate_nodes_destroy(struct devlink *devlink) { static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; devl_assert_locked(devlink); list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (!devlink_rate->parent) continue; refcount_dec(&devlink_rate->parent->refcnt); if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if (devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); list_del(&devlink_rate->list); kfree(devlink_rate->name); kfree(devlink_rate); } } } EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
2 2 2 6 3 3 6 6 8 8 7 1 6 8 7 8 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Monitoring SMC transport protocol sockets * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/init.h> #include <linux/sock_diag.h> #include <linux/inet_diag.h> #include <linux/smc_diag.h> #include <net/netlink.h> #include <net/smc.h> #include "smc.h" #include "smc_core.h" #include "smc_ism.h" struct smc_diag_dump_ctx { int pos[2]; }; static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) { return (struct smc_diag_dump_ctx *)cb->ctx; } static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); memset(r, 0, sizeof(*r)); r->diag_family = sk->sk_family; sock_diag_save_cookie(sk, r->id.idiag_cookie); if (!smc->clcsock) return; r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); r->id.idiag_dport = smc->clcsock->sk->sk_dport; r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; if (sk->sk_protocol == SMCPROTO_SMC) { r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_protocol == SMCPROTO_SMC6) { memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr, sizeof(smc->clcsock->sk->sk_v6_rcv_saddr)); memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr, sizeof(smc->clcsock->sk->sk_v6_daddr)); #endif } } static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct smc_diag_msg *r, struct user_namespace *user_ns) { if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) return 1; r->diag_uid = from_kuid_munged(user_ns, sk_uid(sk)); r->diag_inode = sock_i_ino(sk); return 0; } static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct smc_diag_req *req, struct nlattr *bc) { struct smc_sock *smc = smc_sk(sk); struct smc_diag_fallback fallback; struct user_namespace *user_ns; struct smc_diag_msg *r; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); smc_diag_msg_common_fill(r, sk); r->diag_state = sk->sk_state; if (smc->use_fallback) r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd) r->diag_mode = SMC_DIAG_MODE_SMCD; else r->diag_mode = SMC_DIAG_MODE_SMCR; user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) goto errout; fallback.reason = smc->fallback_rsn; fallback.peer_diagnosis = smc->peer_diagnosis; if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0) goto errout; if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.alert_token_local) { struct smc_connection *conn = &smc->conn; struct smc_diag_conninfo cinfo = { .token = conn->alert_token_local, .sndbuf_size = conn->sndbuf_desc ? conn->sndbuf_desc->len : 0, .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0, .peer_rmbe_size = conn->peer_rmbe_size, .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, .rx_prod.count = conn->local_rx_ctrl.prod.count, .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, .rx_cons.count = conn->local_rx_ctrl.cons.count, .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, .tx_prod.count = conn->local_tx_ctrl.prod.count, .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, .tx_cons.count = conn->local_tx_ctrl.cons.count, .tx_prod_flags = *(u8 *)&conn->local_tx_ctrl.prod_flags, .tx_conn_state_flags = *(u8 *)&conn->local_tx_ctrl.conn_state_flags, .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, .rx_conn_state_flags = *(u8 *)&conn->local_rx_ctrl.conn_state_flags, .tx_prep.wrap = conn->tx_curs_prep.wrap, .tx_prep.count = conn->tx_curs_prep.count, .tx_sent.wrap = conn->tx_curs_sent.wrap, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) goto errout; } if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_link *link = smc->conn.lnk; struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, .lnk[0].ibport = link->ibport, .lnk[0].link_id = link->link_id, }; memcpy(linfo.lnk[0].ibname, link->smcibdev->ibdev->name, sizeof(link->smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; struct smcd_dev *smcd = conn->lgr->smcd; struct smcd_gid smcd_gid; memset(&dinfo, 0, sizeof(dinfo)); dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid.gid; dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; smcd->ops->get_local_gid(smcd, &smcd_gid); dinfo.my_gid = smcd_gid.gid; dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; dinfo.peer_token = conn->peer_token; if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0) goto errout; } nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, struct netlink_callback *cb, int p_type) { struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb); struct net *net = sock_net(skb->sk); int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; int rc = 0, num = 0; struct sock *sk; read_lock(&prot->h.smc_hash->lock); head = &prot->h.smc_hash->ht; if (hlist_empty(head)) goto out; sk_for_each(sk, head) { if (!net_eq(sock_net(sk), net)) continue; if (num < snum) goto next; rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); if (rc < 0) goto out; next: num++; } out: read_unlock(&prot->h.smc_hash->lock); cb_ctx->pos[p_type] = num; return rc; } static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { int rc = 0; rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC); if (!rc) smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6); return skb->len; } static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { struct net *net = sock_net(skb->sk); if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { { struct netlink_dump_control c = { .dump = smc_diag_dump, .min_dump_alloc = SKB_WITH_OVERHEAD(32768), }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } } return 0; } static const struct sock_diag_handler smc_diag_handler = { .owner = THIS_MODULE, .family = AF_SMC, .dump = smc_diag_handler_dump, }; static int __init smc_diag_init(void) { return sock_diag_register(&smc_diag_handler); } static void __exit smc_diag_exit(void) { sock_diag_unregister(&smc_diag_handler); } module_init(smc_diag_init); module_exit(smc_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SMC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME);
16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/stddef.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfserl.h> #define container_obj(layr) ((struct cfserl *) layr) #define CFSERL_STX 0x02 #define SERIAL_MINIUM_PACKET_SIZE 4 #define SERIAL_MAX_FRAMESIZE 4096 struct cfserl { struct cflayer layer; struct cfpkt *incomplete_frm; /* Protects parallel processing of incoming packets */ spinlock_t sync; bool usestx; }; static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); void cfserl_release(struct cflayer *layer) { kfree(layer); } struct cflayer *cfserl_create(int instance, bool use_stx) { struct cfserl *this = kzalloc(sizeof(struct cfserl), GFP_ATOMIC); if (!this) return NULL; caif_assert(offsetof(struct cfserl, layer) == 0); this->layer.receive = cfserl_receive; this->layer.transmit = cfserl_transmit; this->layer.ctrlcmd = cfserl_ctrlcmd; this->usestx = use_stx; spin_lock_init(&this->sync); snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "ser1"); return &this->layer; } static int cfserl_receive(struct cflayer *l, struct cfpkt *newpkt) { struct cfserl *layr = container_obj(l); u16 pkt_len; struct cfpkt *pkt = NULL; struct cfpkt *tail_pkt = NULL; u8 tmp8; u16 tmp; u8 stx = CFSERL_STX; int ret; u16 expectlen = 0; caif_assert(newpkt != NULL); spin_lock(&layr->sync); if (layr->incomplete_frm != NULL) { layr->incomplete_frm = cfpkt_append(layr->incomplete_frm, newpkt, expectlen); pkt = layr->incomplete_frm; if (pkt == NULL) { spin_unlock(&layr->sync); return -ENOMEM; } } else { pkt = newpkt; } layr->incomplete_frm = NULL; do { /* Search for STX at start of pkt if STX is used */ if (layr->usestx) { cfpkt_extr_head(pkt, &tmp8, 1); if (tmp8 != CFSERL_STX) { while (cfpkt_more(pkt) && tmp8 != CFSERL_STX) { cfpkt_extr_head(pkt, &tmp8, 1); } if (!cfpkt_more(pkt)) { cfpkt_destroy(pkt); layr->incomplete_frm = NULL; spin_unlock(&layr->sync); return -EPROTO; } } } pkt_len = cfpkt_getlen(pkt); /* * pkt_len is the accumulated length of the packet data * we have received so far. * Exit if frame doesn't hold length. */ if (pkt_len < 2) { if (layr->usestx) cfpkt_add_head(pkt, &stx, 1); layr->incomplete_frm = pkt; spin_unlock(&layr->sync); return 0; } /* * Find length of frame. * expectlen is the length we need for a full frame. */ cfpkt_peek_head(pkt, &tmp, 2); expectlen = le16_to_cpu(tmp) + 2; /* * Frame error handling */ if (expectlen < SERIAL_MINIUM_PACKET_SIZE || expectlen > SERIAL_MAX_FRAMESIZE) { if (!layr->usestx) { if (pkt != NULL) cfpkt_destroy(pkt); layr->incomplete_frm = NULL; spin_unlock(&layr->sync); return -EPROTO; } continue; } if (pkt_len < expectlen) { /* Too little received data */ if (layr->usestx) cfpkt_add_head(pkt, &stx, 1); layr->incomplete_frm = pkt; spin_unlock(&layr->sync); return 0; } /* * Enough data for at least one frame. * Split the frame, if too long */ if (pkt_len > expectlen) tail_pkt = cfpkt_split(pkt, expectlen); else tail_pkt = NULL; /* Send the first part of packet upwards.*/ spin_unlock(&layr->sync); ret = layr->layer.up->receive(layr->layer.up, pkt); spin_lock(&layr->sync); if (ret == -EILSEQ) { if (layr->usestx) { if (tail_pkt != NULL) pkt = cfpkt_append(pkt, tail_pkt, 0); /* Start search for next STX if frame failed */ continue; } else { cfpkt_destroy(pkt); pkt = NULL; } } pkt = tail_pkt; } while (pkt != NULL); spin_unlock(&layr->sync); return 0; } static int cfserl_transmit(struct cflayer *layer, struct cfpkt *newpkt) { struct cfserl *layr = container_obj(layer); u8 tmp8 = CFSERL_STX; if (layr->usestx) cfpkt_add_head(newpkt, &tmp8, 1); return layer->dn->transmit(layer->dn, newpkt); } static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { layr->up->ctrlcmd(layr->up, ctrl, phyid); }
295 8 56 129 15 509 9 216 216 191 48 48 1 1 9 9 9 13 130 373 65 91 9 2 2 2 2 352 20 605 138 54 464 28 434 287 287 260 123 67 1 8 2 201 269 35 125 34 441 60 431 1 1 401 468 404 66 599 253 349 10 346 309 154 88 37 53 23 28 25 42 36 33 4 39 4 30 400 271 465 205 204 510 511 510 511 511 510 507 511 510 510 509 19 19 2809 2809 205 228 133 27 39 57 15 98 7 38 51 51 305 138 114 114 16 205 205 113 89 20 13 85 85 84 34 35 13 2 97 37 10 1 9 5 5 13 1 6 1 5 10 66 1 1 1 1 1 1 58 1 16 17 11 5 6 10 3 9 6 2 4 196 1 1 58 63 30 30 127 149 151 3 118 67 47 71 13 71 20 1 45 45 3 31 32 31 18 11 7 5 26 52 51 2 35 20 35 11 8 2 1 4 35 10 34 6 1 97 32 1 1 1 30 19 4 1 3 74 1 2 1 1 1 2 67 60 60 5 37 22 7 31 5 23 6 3 19 1 17 2 32 8 1 7 157 1 152 41 141 87 83 30 3 2 2 1 128 129 19 20 90 73 6 39 26 91 216 216 19 213 26 22 3 2 2 9 9 9 9 9 9 4 4 4 4 4 4 4 4 4 4 4 2 4 4 2 1 1 1 1 2 5 4 1 9 5 1 15 15 6 15 15 4 1 4 5 2 3 3 59 2 2 52 27 38 30 3 19 17 78 46 1 41 32 33 26 1 20 17 162 2 1 160 106 56 6 50 49 15 26 20 3 43 1 42 2 36 28 28 68 3 1 4 60 2 2 51 20 71 3 1 1 1 67 33 7 10 6 5 21 8 6 18 12 6 1 1 57 15 1 1 13 4 1 4 6 27 1 24 1 1 1 23 16 16 15 11 12 4 3 2 2 2 2 9 66 55 2 33 22 4 23 43 39 2 17 43 3 33 25 23 32 16 11 28 1 24 8 1 2 17 602 604 320 605 169 465 208 103 78 25 108 108 14 322 322 262 101 38 209 404 268 407 405 121 258 121 612 1 6 611 407 616 2 5 617 634 632 635 4 2 1 4 625 712 32 4 670 7 2 671 19 5 8 635 6 214 215 216 108 215 215 214 217 216 217 218 84 2 10 84 84 213 711 2 23 712 89 8 399 2 216 11 205 498 23 509 109 109 4 4 4 4 4 4 3 1 5 5 736 739 532 205 200 1 717 587 128 148 21 21 20 4 7 5 10 9 7 7 1 1 23 127 3 3 6 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018 Facebook */ #include <uapi/linux/btf.h> #include <uapi/linux/bpf.h> #include <uapi/linux/bpf_perf_event.h> #include <uapi/linux/types.h> #include <linux/seq_file.h> #include <linux/compiler.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/anon_inodes.h> #include <linux/file.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/idr.h> #include <linux/sort.h> #include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/bpf.h> #include <linux/bpf_lsm.h> #include <linux/skmsg.h> #include <linux/perf_event.h> #include <linux/bsearch.h> #include <linux/kobject.h> #include <linux/sysfs.h> #include <linux/overflow.h> #include <net/netfilter/nf_bpf_link.h> #include <net/sock.h> #include <net/xdp.h> #include "../tools/lib/bpf/relo_core.h" /* BTF (BPF Type Format) is the meta data format which describes * the data types of BPF program/map. Hence, it basically focus * on the C programming language which the modern BPF is primary * using. * * ELF Section: * ~~~~~~~~~~~ * The BTF data is stored under the ".BTF" ELF section * * struct btf_type: * ~~~~~~~~~~~~~~~ * Each 'struct btf_type' object describes a C data type. * Depending on the type it is describing, a 'struct btf_type' * object may be followed by more data. F.e. * To describe an array, 'struct btf_type' is followed by * 'struct btf_array'. * * 'struct btf_type' and any extra data following it are * 4 bytes aligned. * * Type section: * ~~~~~~~~~~~~~ * The BTF type section contains a list of 'struct btf_type' objects. * Each one describes a C type. Recall from the above section * that a 'struct btf_type' object could be immediately followed by extra * data in order to describe some particular C types. * * type_id: * ~~~~~~~ * Each btf_type object is identified by a type_id. The type_id * is implicitly implied by the location of the btf_type object in * the BTF type section. The first one has type_id 1. The second * one has type_id 2...etc. Hence, an earlier btf_type has * a smaller type_id. * * A btf_type object may refer to another btf_type object by using * type_id (i.e. the "type" in the "struct btf_type"). * * NOTE that we cannot assume any reference-order. * A btf_type object can refer to an earlier btf_type object * but it can also refer to a later btf_type object. * * For example, to describe "const void *". A btf_type * object describing "const" may refer to another btf_type * object describing "void *". This type-reference is done * by specifying type_id: * * [1] CONST (anon) type_id=2 * [2] PTR (anon) type_id=0 * * The above is the btf_verifier debug log: * - Each line started with "[?]" is a btf_type object * - [?] is the type_id of the btf_type object. * - CONST/PTR is the BTF_KIND_XXX * - "(anon)" is the name of the type. It just * happens that CONST and PTR has no name. * - type_id=XXX is the 'u32 type' in btf_type * * NOTE: "void" has type_id 0 * * String section: * ~~~~~~~~~~~~~~ * The BTF string section contains the names used by the type section. * Each string is referred by an "offset" from the beginning of the * string section. * * Each string is '\0' terminated. * * The first character in the string section must be '\0' * which is used to mean 'anonymous'. Some btf_type may not * have a name. */ /* BTF verification: * * To verify BTF data, two passes are needed. * * Pass #1 * ~~~~~~~ * The first pass is to collect all btf_type objects to * an array: "btf->types". * * Depending on the C type that a btf_type is describing, * a btf_type may be followed by extra data. We don't know * how many btf_type is there, and more importantly we don't * know where each btf_type is located in the type section. * * Without knowing the location of each type_id, most verifications * cannot be done. e.g. an earlier btf_type may refer to a later * btf_type (recall the "const void *" above), so we cannot * check this type-reference in the first pass. * * In the first pass, it still does some verifications (e.g. * checking the name is a valid offset to the string section). * * Pass #2 * ~~~~~~~ * The main focus is to resolve a btf_type that is referring * to another type. * * We have to ensure the referring type: * 1) does exist in the BTF (i.e. in btf->types[]) * 2) does not cause a loop: * struct A { * struct B b; * }; * * struct B { * struct A a; * }; * * btf_type_needs_resolve() decides if a btf_type needs * to be resolved. * * The needs_resolve type implements the "resolve()" ops which * essentially does a DFS and detects backedge. * * During resolve (or DFS), different C types have different * "RESOLVED" conditions. * * When resolving a BTF_KIND_STRUCT, we need to resolve all its * members because a member is always referring to another * type. A struct's member can be treated as "RESOLVED" if * it is referring to a BTF_KIND_PTR. Otherwise, the * following valid C struct would be rejected: * * struct A { * int m; * struct A *a; * }; * * When resolving a BTF_KIND_PTR, it needs to keep resolving if * it is referring to another BTF_KIND_PTR. Otherwise, we cannot * detect a pointer loop, e.g.: * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + * ^ | * +-----------------------------------------+ * */ #define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) #define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) #define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) /* 16MB for 64k structs and each has 16 members and * a few MB spaces for the string section. * The hard limit is S32_MAX. */ #define BTF_MAX_SIZE (16 * 1024 * 1024) #define for_each_member_from(i, from, struct_type, member) \ for (i = from, member = btf_type_member(struct_type) + from; \ i < btf_type_vlen(struct_type); \ i++, member++) #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ i++, member++) DEFINE_IDR(btf_idr); DEFINE_SPINLOCK(btf_idr_lock); enum btf_kfunc_hook { BTF_KFUNC_HOOK_COMMON, BTF_KFUNC_HOOK_XDP, BTF_KFUNC_HOOK_TC, BTF_KFUNC_HOOK_STRUCT_OPS, BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_FMODRET, BTF_KFUNC_HOOK_CGROUP, BTF_KFUNC_HOOK_SCHED_ACT, BTF_KFUNC_HOOK_SK_SKB, BTF_KFUNC_HOOK_SOCKET_FILTER, BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_NETFILTER, BTF_KFUNC_HOOK_KPROBE, BTF_KFUNC_HOOK_MAX, }; enum { BTF_KFUNC_SET_MAX_CNT = 256, BTF_DTOR_KFUNC_MAX_CNT = 256, BTF_KFUNC_FILTER_MAX_CNT = 16, }; struct btf_kfunc_hook_filter { btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT]; u32 nr_filters; }; struct btf_kfunc_set_tab { struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX]; struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX]; }; struct btf_id_dtor_kfunc_tab { u32 cnt; struct btf_id_dtor_kfunc dtors[]; }; struct btf_struct_ops_tab { u32 cnt; u32 capacity; struct bpf_struct_ops_desc ops[]; }; struct btf { void *data; struct btf_type **types; u32 *resolved_ids; u32 *resolved_sizes; const char *strings; void *nohdr_data; struct btf_header hdr; u32 nr_types; /* includes VOID for base BTF */ u32 types_size; u32 data_size; refcount_t refcnt; u32 id; struct rcu_head rcu; struct btf_kfunc_set_tab *kfunc_set_tab; struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; struct btf_struct_metas *struct_meta_tab; struct btf_struct_ops_tab *struct_ops_tab; /* split BTF support */ struct btf *base_btf; u32 start_id; /* first type ID in this BTF (0 for base BTF) */ u32 start_str_off; /* first string offset (0 for base BTF) */ char name[MODULE_NAME_LEN]; bool kernel_btf; __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */ }; enum verifier_phase { CHECK_META, CHECK_TYPE, }; struct resolve_vertex { const struct btf_type *t; u32 type_id; u16 next_member; }; enum visit_state { NOT_VISITED, VISITED, RESOLVED, }; enum resolve_mode { RESOLVE_TBD, /* To Be Determined */ RESOLVE_PTR, /* Resolving for Pointer */ RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union * or array */ }; #define MAX_RESOLVE_DEPTH 32 struct btf_sec_info { u32 off; u32 len; }; struct btf_verifier_env { struct btf *btf; u8 *visit_states; struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; struct bpf_verifier_log log; u32 log_type_id; u32 top_stack; enum verifier_phase phase; enum resolve_mode resolve_mode; }; static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_UNKN] = "UNKNOWN", [BTF_KIND_INT] = "INT", [BTF_KIND_PTR] = "PTR", [BTF_KIND_ARRAY] = "ARRAY", [BTF_KIND_STRUCT] = "STRUCT", [BTF_KIND_UNION] = "UNION", [BTF_KIND_ENUM] = "ENUM", [BTF_KIND_FWD] = "FWD", [BTF_KIND_TYPEDEF] = "TYPEDEF", [BTF_KIND_VOLATILE] = "VOLATILE", [BTF_KIND_CONST] = "CONST", [BTF_KIND_RESTRICT] = "RESTRICT", [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", [BTF_KIND_ENUM64] = "ENUM64", }; const char *btf_type_str(const struct btf_type *t) { return btf_kind_str[BTF_INFO_KIND(t->info)]; } /* Chunk size we use in safe copy of data to be shown. */ #define BTF_SHOW_OBJ_SAFE_SIZE 32 /* * This is the maximum size of a base type value (equivalent to a * 128-bit int); if we are at the end of our safe buffer and have * less than 16 bytes space we can't be assured of being able * to copy the next type safely, so in such cases we will initiate * a new copy. */ #define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16 /* Type name size */ #define BTF_SHOW_NAME_SIZE 80 /* * The suffix of a type that indicates it cannot alias another type when * comparing BTF IDs for kfunc invocations. */ #define NOCAST_ALIAS_SUFFIX "___init" /* * Common data to all BTF show operations. Private show functions can add * their own data to a structure containing a struct btf_show and consult it * in the show callback. See btf_type_show() below. * * One challenge with showing nested data is we want to skip 0-valued * data, but in order to figure out whether a nested object is all zeros * we need to walk through it. As a result, we need to make two passes * when handling structs, unions and arrays; the first path simply looks * for nonzero data, while the second actually does the display. The first * pass is signalled by show->state.depth_check being set, and if we * encounter a non-zero value we set show->state.depth_to_show to * the depth at which we encountered it. When we have completed the * first pass, we will know if anything needs to be displayed if * depth_to_show > depth. See btf_[struct,array]_show() for the * implementation of this. * * Another problem is we want to ensure the data for display is safe to * access. To support this, the anonymous "struct {} obj" tracks the data * object and our safe copy of it. We copy portions of the data needed * to the object "copy" buffer, but because its size is limited to * BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we * traverse larger objects for display. * * The various data type show functions all start with a call to * btf_show_start_type() which returns a pointer to the safe copy * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the * raw data itself). btf_show_obj_safe() is responsible for * using copy_from_kernel_nofault() to update the safe data if necessary * as we traverse the object's data. skbuff-like semantics are * used: * * - obj.head points to the start of the toplevel object for display * - obj.size is the size of the toplevel object * - obj.data points to the current point in the original data at * which our safe data starts. obj.data will advance as we copy * portions of the data. * * In most cases a single copy will suffice, but larger data structures * such as "struct task_struct" will require many copies. The logic in * btf_show_obj_safe() handles the logic that determines if a new * copy_from_kernel_nofault() is needed. */ struct btf_show { u64 flags; void *target; /* target of show operation (seq file, buffer) */ __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args); const struct btf *btf; /* below are used during iteration */ struct { u8 depth; u8 depth_to_show; u8 depth_check; u8 array_member:1, array_terminated:1; u16 array_encoding; u32 type_id; int status; /* non-zero for error */ const struct btf_type *type; const struct btf_member *member; char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */ } state; struct { u32 size; void *head; void *data; u8 safe[BTF_SHOW_OBJ_SAFE_SIZE]; } obj; }; struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left); int (*resolve)(struct btf_verifier_env *env, const struct resolve_vertex *v); int (*check_member)(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); int (*check_kflag_member)(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); void (*show)(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offsets, struct btf_show *show); }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; static struct btf_type btf_void; static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id); static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *t); static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier * but they are grouped into the same bucket * for BTF concern: * A type (t) that refers to another * type through t->type AND its size cannot * be determined without following the t->type. * * ptr does not fall into this bucket * because its size is always sizeof(void *). */ switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: case BTF_KIND_TYPE_TAG: return true; } return false; } bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; } static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } static bool btf_type_is_decl_tag(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; } static bool btf_type_nosize(const struct btf_type *t) { return btf_type_is_void(t) || btf_type_is_fwd(t) || btf_type_is_func(t) || btf_type_is_func_proto(t) || btf_type_is_decl_tag(t); } static bool btf_type_nosize_or_null(const struct btf_type *t) { return !t || btf_type_nosize(t); } static bool btf_type_is_decl_tag_target(const struct btf_type *t) { return btf_type_is_func(t) || btf_type_is_struct(t) || btf_type_is_var(t) || btf_type_is_typedef(t); } bool btf_is_vmlinux(const struct btf *btf) { return btf->kernel_btf && !btf->base_btf; } u32 btf_nr_types(const struct btf *btf) { u32 total = 0; while (btf) { total += btf->nr_types; btf = btf->base_btf; } return total; } s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { const struct btf_type *t; const char *tname; u32 i, total; total = btf_nr_types(btf); for (i = 1; i < total; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) continue; tname = btf_name_by_offset(btf, t->name_off); if (!strcmp(tname, name)) return i; } return -ENOENT; } s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) { struct btf *btf; s32 ret; int id; btf = bpf_get_btf_vmlinux(); if (IS_ERR(btf)) return PTR_ERR(btf); if (!btf) return -EINVAL; ret = btf_find_by_name_kind(btf, name, kind); /* ret is never zero, since btf_find_by_name_kind returns * positive btf_id or negative error. */ if (ret > 0) { btf_get(btf); *btf_p = btf; return ret; } /* If name is not found in vmlinux's BTF then search in module's BTFs */ spin_lock_bh(&btf_idr_lock); idr_for_each_entry(&btf_idr, btf, id) { if (!btf_is_module(btf)) continue; /* linear search could be slow hence unlock/lock * the IDR to avoiding holding it for too long */ btf_get(btf); spin_unlock_bh(&btf_idr_lock); ret = btf_find_by_name_kind(btf, name, kind); if (ret > 0) { *btf_p = btf; return ret; } btf_put(btf); spin_lock_bh(&btf_idr_lock); } spin_unlock_bh(&btf_idr_lock); return ret; } EXPORT_SYMBOL_GPL(bpf_find_btf_id); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) { const struct btf_type *t = btf_type_by_id(btf, id); while (btf_type_is_modifier(t)) { id = t->type; t = btf_type_by_id(btf, t->type); } if (res_id) *res_id = id; return t; } const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, u32 id, u32 *res_id) { const struct btf_type *t; t = btf_type_skip_modifiers(btf, id, NULL); if (!btf_type_is_ptr(t)) return NULL; return btf_type_skip_modifiers(btf, t->type, res_id); } const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id) { const struct btf_type *ptype; ptype = btf_type_resolve_ptr(btf, id, res_id); if (ptype && btf_type_is_func_proto(ptype)) return ptype; return NULL; } /* Types that act only as a source, not sink or intermediate * type when resolving. */ static bool btf_type_is_resolve_source_only(const struct btf_type *t) { return btf_type_is_var(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } /* What types need to be resolved? * * btf_type_is_modifier() is an obvious one. * * btf_type_is_struct() because its member refers to * another type (through member->type). * * btf_type_is_var() because the variable refers to * another type. btf_type_is_datasec() holds multiple * btf_type_is_var() types that need resolving. * * btf_type_is_array() because its element (array->type) * refers to another type. Array can be thought of a * special case of struct while array just has the same * member-type repeated by array->nelems of times. */ static bool btf_type_needs_resolve(const struct btf_type *t) { return btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_struct(t) || btf_type_is_array(t) || btf_type_is_var(t) || btf_type_is_func(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } /* t->size can be used */ static bool btf_type_has_size(const struct btf_type *t) { switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_INT: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: case BTF_KIND_ENUM64: return true; } return false; } static const char *btf_int_encoding_str(u8 encoding) { if (encoding == 0) return "(none)"; else if (encoding == BTF_INT_SIGNED) return "SIGNED"; else if (encoding == BTF_INT_CHAR) return "CHAR"; else if (encoding == BTF_INT_BOOL) return "BOOL"; else return "UNKN"; } static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); } static const struct btf_array *btf_type_array(const struct btf_type *t) { return (const struct btf_array *)(t + 1); } static const struct btf_enum *btf_type_enum(const struct btf_type *t) { return (const struct btf_enum *)(t + 1); } static const struct btf_var *btf_type_var(const struct btf_type *t) { return (const struct btf_var *)(t + 1); } static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t) { return (const struct btf_decl_tag *)(t + 1); } static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t) { return (const struct btf_enum64 *)(t + 1); } static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; } static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { if (!BTF_STR_OFFSET_VALID(offset)) return false; while (offset < btf->start_str_off) btf = btf->base_btf; offset -= btf->start_str_off; return offset < btf->hdr.str_len; } static bool __btf_name_char_ok(char c, bool first) { if ((first ? !isalpha(c) : !isalnum(c)) && c != '_' && c != '.') return false; return true; } const char *btf_str_by_offset(const struct btf *btf, u32 offset) { while (offset < btf->start_str_off) btf = btf->base_btf; offset -= btf->start_str_off; if (offset < btf->hdr.str_len) return &btf->strings[offset]; return NULL; } static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!__btf_name_char_ok(*src, true)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { if (!__btf_name_char_ok(*src, false)) return false; src++; } return !*src; } /* Allow any printable character in DATASEC names */ static bool btf_name_valid_section(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!*src) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; while (*src && src < src_limit) { if (!isprint(*src)) return false; src++; } return !*src; } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { const char *name; if (!offset) return "(anon)"; name = btf_str_by_offset(btf, offset); return name ?: "(invalid-name-offset)"; } const char *btf_name_by_offset(const struct btf *btf, u32 offset) { return btf_str_by_offset(btf, offset); } const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { while (type_id < btf->start_id) btf = btf->base_btf; type_id -= btf->start_id; if (type_id >= btf->nr_types) return NULL; return btf->types[type_id]; } EXPORT_SYMBOL_GPL(btf_type_by_id); /* * Check that the type @t is a regular int. This means that @t is not * a bit field and it has the same size as either of u8/u16/u32/u64 * or __int128. If @expected_size is not zero, then size of @t should * be the same. A caller should already have checked that the type @t * is an integer. */ static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size) { u32 int_data = btf_type_int(t); u8 nr_bits = BTF_INT_BITS(int_data); u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); return BITS_PER_BYTE_MASKED(nr_bits) == 0 && BTF_INT_OFFSET(int_data) == 0 && (nr_bytes <= 16 && is_power_of_2(nr_bytes)) && (expected_size == 0 || nr_bytes == expected_size); } static bool btf_type_int_is_regular(const struct btf_type *t) { return __btf_type_int_is_regular(t, 0); } bool btf_type_is_i32(const struct btf_type *t) { return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4); } bool btf_type_is_i64(const struct btf_type *t) { return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8); } bool btf_type_is_primitive(const struct btf_type *t) { return (btf_type_is_int(t) && btf_type_int_is_regular(t)) || btf_is_any_enum(t); } /* * Check that given struct member is a regular int with expected * offset and size. */ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size) { const struct btf_type *t; u32 id, int_data; u8 nr_bits; id = m->type; t = btf_type_id_size(btf, &id, NULL); if (!t || !btf_type_is_int(t)) return false; int_data = btf_type_int(t); nr_bits = BTF_INT_BITS(int_data); if (btf_type_kflag(s)) { u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset); u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset); /* if kflag set, int should be a regular int and * bit offset should be at byte boundary. */ return !bitfield_size && BITS_ROUNDUP_BYTES(bit_offset) == expected_offset && BITS_ROUNDUP_BYTES(nr_bits) == expected_size; } if (BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(m->offset) || BITS_ROUNDUP_BYTES(m->offset) != expected_offset || BITS_PER_BYTE_MASKED(nr_bits) || BITS_ROUNDUP_BYTES(nr_bits) != expected_size) return false; return true; } /* Similar to btf_type_skip_modifiers() but does not skip typedefs. */ static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, u32 id) { const struct btf_type *t = btf_type_by_id(btf, id); while (btf_type_is_modifier(t) && BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { t = btf_type_by_id(btf, t->type); } return t; } #define BTF_SHOW_MAX_ITER 10 #define BTF_KIND_BIT(kind) (1ULL << kind) /* * Populate show->state.name with type name information. * Format of type name is * * [.member_name = ] (type_name) */ static const char *btf_show_name(struct btf_show *show) { /* BTF_MAX_ITER array suffixes "[]" */ const char *array_suffixes = "[][][][][][][][][][]"; const char *array_suffix = &array_suffixes[strlen(array_suffixes)]; /* BTF_MAX_ITER pointer suffixes "*" */ const char *ptr_suffixes = "**********"; const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; const char *name = NULL, *prefix = "", *parens = ""; const struct btf_member *m = show->state.member; const struct btf_type *t; const struct btf_array *array; u32 id = show->state.type_id; const char *member = NULL; bool show_member = false; u64 kinds = 0; int i; show->state.name[0] = '\0'; /* * Don't show type name if we're showing an array member; * in that case we show the array type so don't need to repeat * ourselves for each member. */ if (show->state.array_member) return ""; /* Retrieve member name, if any. */ if (m) { member = btf_name_by_offset(show->btf, m->name_off); show_member = strlen(member) > 0; id = m->type; } /* * Start with type_id, as we have resolved the struct btf_type * * via btf_modifier_show() past the parent typedef to the child * struct, int etc it is defined as. In such cases, the type_id * still represents the starting type while the struct btf_type * * in our show->state points at the resolved type of the typedef. */ t = btf_type_by_id(show->btf, id); if (!t) return ""; /* * The goal here is to build up the right number of pointer and * array suffixes while ensuring the type name for a typedef * is represented. Along the way we accumulate a list of * BTF kinds we have encountered, since these will inform later * display; for example, pointer types will not require an * opening "{" for struct, we will just display the pointer value. * * We also want to accumulate the right number of pointer or array * indices in the format string while iterating until we get to * the typedef/pointee/array member target type. * * We start by pointing at the end of pointer and array suffix * strings; as we accumulate pointers and arrays we move the pointer * or array string backwards so it will show the expected number of * '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers * and/or arrays and typedefs are supported as a precaution. * * We also want to get typedef name while proceeding to resolve * type it points to so that we can add parentheses if it is a * "typedef struct" etc. */ for (i = 0; i < BTF_SHOW_MAX_ITER; i++) { switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_TYPEDEF: if (!name) name = btf_name_by_offset(show->btf, t->name_off); kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF); id = t->type; break; case BTF_KIND_ARRAY: kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY); parens = "["; if (!t) return ""; array = btf_type_array(t); if (array_suffix > array_suffixes) array_suffix -= 2; id = array->type; break; case BTF_KIND_PTR: kinds |= BTF_KIND_BIT(BTF_KIND_PTR); if (ptr_suffix > ptr_suffixes) ptr_suffix -= 1; id = t->type; break; default: id = 0; break; } if (!id) break; t = btf_type_skip_qualifiers(show->btf, id); } /* We may not be able to represent this type; bail to be safe */ if (i == BTF_SHOW_MAX_ITER) return ""; if (!name) name = btf_name_by_offset(show->btf, t->name_off); switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_STRUCT: case BTF_KIND_UNION: prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ? "struct" : "union"; /* if it's an array of struct/union, parens is already set */ if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY)))) parens = "{"; break; case BTF_KIND_ENUM: case BTF_KIND_ENUM64: prefix = "enum"; break; default: break; } /* pointer does not require parens */ if (kinds & BTF_KIND_BIT(BTF_KIND_PTR)) parens = ""; /* typedef does not require struct/union/enum prefix */ if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF)) prefix = ""; if (!name) name = ""; /* Even if we don't want type name info, we want parentheses etc */ if (show->flags & BTF_SHOW_NONAME) snprintf(show->state.name, sizeof(show->state.name), "%s", parens); else snprintf(show->state.name, sizeof(show->state.name), "%s%s%s(%s%s%s%s%s%s)%s", /* first 3 strings comprise ".member = " */ show_member ? "." : "", show_member ? member : "", show_member ? " = " : "", /* ...next is our prefix (struct, enum, etc) */ prefix, strlen(prefix) > 0 && strlen(name) > 0 ? " " : "", /* ...this is the type name itself */ name, /* ...suffixed by the appropriate '*', '[]' suffixes */ strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix, array_suffix, parens); return show->state.name; } static const char *__btf_show_indent(struct btf_show *show) { const char *indents = " "; const char *indent = &indents[strlen(indents)]; if ((indent - show->state.depth) >= indents) return indent - show->state.depth; return indents; } static const char *btf_show_indent(struct btf_show *show) { return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show); } static const char *btf_show_newline(struct btf_show *show) { return show->flags & BTF_SHOW_COMPACT ? "" : "\n"; } static const char *btf_show_delim(struct btf_show *show) { if (show->state.depth == 0) return ""; if ((show->flags & BTF_SHOW_COMPACT) && show->state.type && BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION) return "|"; return ","; } __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...) { va_list args; if (!show->state.depth_check) { va_start(args, fmt); show->showfn(show, fmt, args); va_end(args); } } /* Macros are used here as btf_show_type_value[s]() prepends and appends * format specifiers to the format specifier passed in; these do the work of * adding indentation, delimiters etc while the caller simply has to specify * the type value(s) in the format specifier + value(s). */ #define btf_show_type_value(show, fmt, value) \ do { \ if ((value) != (__typeof__(value))0 || \ (show->flags & BTF_SHOW_ZERO) || \ show->state.depth == 0) { \ btf_show(show, "%s%s" fmt "%s%s", \ btf_show_indent(show), \ btf_show_name(show), \ value, btf_show_delim(show), \ btf_show_newline(show)); \ if (show->state.depth > show->state.depth_to_show) \ show->state.depth_to_show = show->state.depth; \ } \ } while (0) #define btf_show_type_values(show, fmt, ...) \ do { \ btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \ btf_show_name(show), \ __VA_ARGS__, btf_show_delim(show), \ btf_show_newline(show)); \ if (show->state.depth > show->state.depth_to_show) \ show->state.depth_to_show = show->state.depth; \ } while (0) /* How much is left to copy to safe buffer after @data? */ static int btf_show_obj_size_left(struct btf_show *show, void *data) { return show->obj.head + show->obj.size - data; } /* Is object pointed to by @data of @size already copied to our safe buffer? */ static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size) { return data >= show->obj.data && (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE); } /* * If object pointed to by @data of @size falls within our safe buffer, return * the equivalent pointer to the same safe data. Assumes * copy_from_kernel_nofault() has already happened and our safe buffer is * populated. */ static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size) { if (btf_show_obj_is_safe(show, data, size)) return show->obj.safe + (data - show->obj.data); return NULL; } /* * Return a safe-to-access version of data pointed to by @data. * We do this by copying the relevant amount of information * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault(). * * If BTF_SHOW_UNSAFE is specified, just return data as-is; no * safe copy is needed. * * Otherwise we need to determine if we have the required amount * of data (determined by the @data pointer and the size of the * largest base type we can encounter (represented by * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures * that we will be able to print some of the current object, * and if more is needed a copy will be triggered. * Some objects such as structs will not fit into the buffer; * in such cases additional copies when we iterate over their * members may be needed. * * btf_show_obj_safe() is used to return a safe buffer for * btf_show_start_type(); this ensures that as we recurse into * nested types we always have safe data for the given type. * This approach is somewhat wasteful; it's possible for example * that when iterating over a large union we'll end up copying the * same data repeatedly, but the goal is safety not performance. * We use stack data as opposed to per-CPU buffers because the * iteration over a type can take some time, and preemption handling * would greatly complicate use of the safe buffer. */ static void *btf_show_obj_safe(struct btf_show *show, const struct btf_type *t, void *data) { const struct btf_type *rt; int size_left, size; void *safe = NULL; if (show->flags & BTF_SHOW_UNSAFE) return data; rt = btf_resolve_size(show->btf, t, &size); if (IS_ERR(rt)) { show->state.status = PTR_ERR(rt); return NULL; } /* * Is this toplevel object? If so, set total object size and * initialize pointers. Otherwise check if we still fall within * our safe object data. */ if (show->state.depth == 0) { show->obj.size = size; show->obj.head = data; } else { /* * If the size of the current object is > our remaining * safe buffer we _may_ need to do a new copy. However * consider the case of a nested struct; it's size pushes * us over the safe buffer limit, but showing any individual * struct members does not. In such cases, we don't need * to initiate a fresh copy yet; however we definitely need * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left * in our buffer, regardless of the current object size. * The logic here is that as we resolve types we will * hit a base type at some point, and we need to be sure * the next chunk of data is safely available to display * that type info safely. We cannot rely on the size of * the current object here because it may be much larger * than our current buffer (e.g. task_struct is 8k). * All we want to do here is ensure that we can print the * next basic type, which we can if either * - the current type size is within the safe buffer; or * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in * the safe buffer. */ safe = __btf_show_obj_safe(show, data, min(size, BTF_SHOW_OBJ_BASE_TYPE_SIZE)); } /* * We need a new copy to our safe object, either because we haven't * yet copied and are initializing safe data, or because the data * we want falls outside the boundaries of the safe object. */ if (!safe) { size_left = btf_show_obj_size_left(show, data); if (size_left > BTF_SHOW_OBJ_SAFE_SIZE) size_left = BTF_SHOW_OBJ_SAFE_SIZE; show->state.status = copy_from_kernel_nofault(show->obj.safe, data, size_left); if (!show->state.status) { show->obj.data = data; safe = show->obj.safe; } } return safe; } /* * Set the type we are starting to show and return a safe data pointer * to be used for showing the associated data. */ static void *btf_show_start_type(struct btf_show *show, const struct btf_type *t, u32 type_id, void *data) { show->state.type = t; show->state.type_id = type_id; show->state.name[0] = '\0'; return btf_show_obj_safe(show, t, data); } static void btf_show_end_type(struct btf_show *show) { show->state.type = NULL; show->state.type_id = 0; show->state.name[0] = '\0'; } static void *btf_show_start_aggr_type(struct btf_show *show, const struct btf_type *t, u32 type_id, void *data) { void *safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return safe_data; btf_show(show, "%s%s%s", btf_show_indent(show), btf_show_name(show), btf_show_newline(show)); show->state.depth++; return safe_data; } static void btf_show_end_aggr_type(struct btf_show *show, const char *suffix) { show->state.depth--; btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix, btf_show_delim(show), btf_show_newline(show)); btf_show_end_type(show); } static void btf_show_start_member(struct btf_show *show, const struct btf_member *m) { show->state.member = m; } static void btf_show_start_array_member(struct btf_show *show) { show->state.array_member = 1; btf_show_start_member(show, NULL); } static void btf_show_end_member(struct btf_show *show) { show->state.member = NULL; } static void btf_show_end_array_member(struct btf_show *show) { show->state.array_member = 0; btf_show_end_member(show); } static void *btf_show_start_array_type(struct btf_show *show, const struct btf_type *t, u32 type_id, u16 array_encoding, void *data) { show->state.array_encoding = array_encoding; show->state.array_terminated = 0; return btf_show_start_aggr_type(show, t, type_id, data); } static void btf_show_end_array_type(struct btf_show *show) { show->state.array_encoding = 0; show->state.array_terminated = 0; btf_show_end_aggr_type(show, "]"); } static void *btf_show_start_struct_type(struct btf_show *show, const struct btf_type *t, u32 type_id, void *data) { return btf_show_start_aggr_type(show, t, type_id, data); } static void btf_show_end_struct_type(struct btf_show *show) { btf_show_end_aggr_type(show, "}"); } __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { va_list args; va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; va_list args; if (!bpf_verifier_log_needed(log)) return; va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, const struct btf_type *t, bool log_details, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; struct btf *btf = env->btf; va_list args; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL) { /* btf verifier prints all types it is processing via * btf_verifier_log_type(..., fmt = NULL). * Skip those prints for in-kernel BTF verification. */ if (!fmt) return; /* Skip logging when loading module BTF with mismatches permitted */ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) return; } __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); if (log_details) btf_type_ops(t)->log_details(env, t); if (fmt && *fmt) { __btf_verifier_log(log, " "); va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __btf_verifier_log(log, "\n"); } #define btf_verifier_log_type(env, t, ...) \ __btf_verifier_log_type((env), (t), true, __VA_ARGS__) #define btf_verifier_log_basic(env, t, ...) \ __btf_verifier_log_type((env), (t), false, __VA_ARGS__) __printf(4, 5) static void btf_verifier_log_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; struct btf *btf = env->btf; va_list args; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL) { if (!fmt) return; /* Skip logging when loading module BTF with mismatches permitted */ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) return; } /* The CHECK_META phase already did a btf dump. * * If member is logged again, it must hit an error in * parsing this member. It is useful to print out which * struct this member belongs to. */ if (env->phase != CHECK_META) btf_verifier_log_type(env, struct_type, NULL); if (btf_type_kflag(struct_type)) __btf_verifier_log(log, "\t%s type_id=%u bitfield_size=%u bits_offset=%u", __btf_name_by_offset(btf, member->name_off), member->type, BTF_MEMBER_BITFIELD_SIZE(member->offset), BTF_MEMBER_BIT_OFFSET(member->offset)); else __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", __btf_name_by_offset(btf, member->name_off), member->type, member->offset); if (fmt && *fmt) { __btf_verifier_log(log, " "); va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __btf_verifier_log(log, "\n"); } __printf(4, 5) static void btf_verifier_log_vsi(struct btf_verifier_env *env, const struct btf_type *datasec_type, const struct btf_var_secinfo *vsi, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; va_list args; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL && !fmt) return; if (env->phase != CHECK_META) btf_verifier_log_type(env, datasec_type, NULL); __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", vsi->type, vsi->offset, vsi->size); if (fmt && *fmt) { __btf_verifier_log(log, " "); va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __btf_verifier_log(log, "\n"); } static void btf_verifier_log_hdr(struct btf_verifier_env *env, u32 btf_data_size) { struct bpf_verifier_log *log = &env->log; const struct btf *btf = env->btf; const struct btf_header *hdr; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL) return; hdr = &btf->hdr; __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); __btf_verifier_log(log, "version: %u\n", hdr->version); __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); } static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) { struct btf *btf = env->btf; if (btf->types_size == btf->nr_types) { /* Expand 'types' array */ struct btf_type **new_types; u32 expand_by, new_size; if (btf->start_id + btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } expand_by = max_t(u32, btf->types_size >> 2, 16); new_size = min_t(u32, BTF_MAX_TYPE, btf->types_size + expand_by); new_types = kvcalloc(new_size, sizeof(*new_types), GFP_KERNEL | __GFP_NOWARN); if (!new_types) return -ENOMEM; if (btf->nr_types == 0) { if (!btf->base_btf) { /* lazily init VOID type */ new_types[0] = &btf_void; btf->nr_types++; } } else { memcpy(new_types, btf->types, sizeof(*btf->types) * btf->nr_types); } kvfree(btf->types); btf->types = new_types; btf->types_size = new_size; } btf->types[btf->nr_types++] = t; return 0; } static int btf_alloc_id(struct btf *btf) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&btf_idr_lock); id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC); if (id > 0) btf->id = id; spin_unlock_bh(&btf_idr_lock); idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 0 : id; } static void btf_free_id(struct btf *btf) { unsigned long flags; /* * In map-in-map, calling map_delete_elem() on outer * map will call bpf_map_put on the inner map. * It will then eventually call btf_free_id() * on the inner map. Some of the map_delete_elem() * implementation may have irq disabled, so * we need to use the _irqsave() version instead * of the _bh() version. */ spin_lock_irqsave(&btf_idr_lock, flags); idr_remove(&btf_idr, btf->id); spin_unlock_irqrestore(&btf_idr_lock, flags); } static void btf_free_kfunc_set_tab(struct btf *btf) { struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; int hook; if (!tab) return; for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) kfree(tab->sets[hook]); kfree(tab); btf->kfunc_set_tab = NULL; } static void btf_free_dtor_kfunc_tab(struct btf *btf) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; if (!tab) return; kfree(tab); btf->dtor_kfunc_tab = NULL; } static void btf_struct_metas_free(struct btf_struct_metas *tab) { int i; if (!tab) return; for (i = 0; i < tab->cnt; i++) btf_record_free(tab->types[i].record); kfree(tab); } static void btf_free_struct_meta_tab(struct btf *btf) { struct btf_struct_metas *tab = btf->struct_meta_tab; btf_struct_metas_free(tab); btf->struct_meta_tab = NULL; } static void btf_free_struct_ops_tab(struct btf *btf) { struct btf_struct_ops_tab *tab = btf->struct_ops_tab; u32 i; if (!tab) return; for (i = 0; i < tab->cnt; i++) bpf_struct_ops_desc_release(&tab->ops[i]); kfree(tab); btf->struct_ops_tab = NULL; } static void btf_free(struct btf *btf) { btf_free_struct_meta_tab(btf); btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); btf_free_struct_ops_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); kvfree(btf->resolved_ids); /* vmlinux does not allocate btf->data, it simply points it at * __start_BTF. */ if (!btf_is_vmlinux(btf)) kvfree(btf->data); kvfree(btf->base_id_map); kfree(btf); } static void btf_free_rcu(struct rcu_head *rcu) { struct btf *btf = container_of(rcu, struct btf, rcu); btf_free(btf); } const char *btf_get_name(const struct btf *btf) { return btf->name; } void btf_get(struct btf *btf) { refcount_inc(&btf->refcnt); } void btf_put(struct btf *btf) { if (btf && refcount_dec_and_test(&btf->refcnt)) { btf_free_id(btf); call_rcu(&btf->rcu, btf_free_rcu); } } struct btf *btf_base_btf(const struct btf *btf) { return btf->base_btf; } const struct btf_header *btf_header(const struct btf *btf) { return &btf->hdr; } void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) { btf->base_btf = (struct btf *)base_btf; btf->start_id = btf_nr_types(base_btf); btf->start_str_off = base_btf->hdr.str_len; } static int env_resolve_init(struct btf_verifier_env *env) { struct btf *btf = env->btf; u32 nr_types = btf->nr_types; u32 *resolved_sizes = NULL; u32 *resolved_ids = NULL; u8 *visit_states = NULL; resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes), GFP_KERNEL | __GFP_NOWARN); if (!resolved_sizes) goto nomem; resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids), GFP_KERNEL | __GFP_NOWARN); if (!resolved_ids) goto nomem; visit_states = kvcalloc(nr_types, sizeof(*visit_states), GFP_KERNEL | __GFP_NOWARN); if (!visit_states) goto nomem; btf->resolved_sizes = resolved_sizes; btf->resolved_ids = resolved_ids; env->visit_states = visit_states; return 0; nomem: kvfree(resolved_sizes); kvfree(resolved_ids); kvfree(visit_states); return -ENOMEM; } static void btf_verifier_env_free(struct btf_verifier_env *env) { kvfree(env->visit_states); kfree(env); } static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, const struct btf_type *next_type) { switch (env->resolve_mode) { case RESOLVE_TBD: /* int, enum or void is a sink */ return !btf_type_needs_resolve(next_type); case RESOLVE_PTR: /* int, enum, void, struct, array, func or func_proto is a sink * for ptr */ return !btf_type_is_modifier(next_type) && !btf_type_is_ptr(next_type); case RESOLVE_STRUCT_OR_ARRAY: /* int, enum, void, ptr, func or func_proto is a sink * for struct and array */ return !btf_type_is_modifier(next_type) && !btf_type_is_array(next_type) && !btf_type_is_struct(next_type); default: BUG(); } } static bool env_type_is_resolved(const struct btf_verifier_env *env, u32 type_id) { /* base BTF types should be resolved by now */ if (type_id < env->btf->start_id) return true; return env->visit_states[type_id - env->btf->start_id] == RESOLVED; } static int env_stack_push(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { const struct btf *btf = env->btf; struct resolve_vertex *v; if (env->top_stack == MAX_RESOLVE_DEPTH) return -E2BIG; if (type_id < btf->start_id || env->visit_states[type_id - btf->start_id] != NOT_VISITED) return -EEXIST; env->visit_states[type_id - btf->start_id] = VISITED; v = &env->stack[env->top_stack++]; v->t = t; v->type_id = type_id; v->next_member = 0; if (env->resolve_mode == RESOLVE_TBD) { if (btf_type_is_ptr(t)) env->resolve_mode = RESOLVE_PTR; else if (btf_type_is_struct(t) || btf_type_is_array(t)) env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; } return 0; } static void env_stack_set_next_member(struct btf_verifier_env *env, u16 next_member) { env->stack[env->top_stack - 1].next_member = next_member; } static void env_stack_pop_resolved(struct btf_verifier_env *env, u32 resolved_type_id, u32 resolved_size) { u32 type_id = env->stack[--(env->top_stack)].type_id; struct btf *btf = env->btf; type_id -= btf->start_id; /* adjust to local type id */ btf->resolved_sizes[type_id] = resolved_size; btf->resolved_ids[type_id] = resolved_type_id; env->visit_states[type_id] = RESOLVED; } static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) { return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; } /* Resolve the size of a passed-in "type" * * type: is an array (e.g. u32 array[x][y]) * return type: type "u32[x][y]", i.e. BTF_KIND_ARRAY, * *type_size: (x * y * sizeof(u32)). Hence, *type_size always * corresponds to the return type. * *elem_type: u32 * *elem_id: id of u32 * *total_nelems: (x * y). Hence, individual elem size is * (*type_size / *total_nelems) * *type_id: id of type if it's changed within the function, 0 if not * * type: is not an array (e.g. const struct X) * return type: type "struct X" * *type_size: sizeof(struct X) * *elem_type: same as return type ("struct X") * *elem_id: 0 * *total_nelems: 1 * *type_id: id of type if it's changed within the function, 0 if not */ static const struct btf_type * __btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, u32 *elem_id, u32 *total_nelems, u32 *type_id) { const struct btf_type *array_type = NULL; const struct btf_array *array = NULL; u32 i, size, nelems = 1, id = 0; for (i = 0; i < MAX_RESOLVE_DEPTH; i++) { switch (BTF_INFO_KIND(type->info)) { /* type->size can be used */ case BTF_KIND_INT: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FLOAT: case BTF_KIND_ENUM64: size = type->size; goto resolved; case BTF_KIND_PTR: size = sizeof(void *); goto resolved; /* Modifiers */ case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: case BTF_KIND_TYPE_TAG: id = type->type; type = btf_type_by_id(btf, type->type); break; case BTF_KIND_ARRAY: if (!array_type) array_type = type; array = btf_type_array(type); if (nelems && array->nelems > U32_MAX / nelems) return ERR_PTR(-EINVAL); nelems *= array->nelems; type = btf_type_by_id(btf, array->type); break; /* type without size */ default: return ERR_PTR(-EINVAL); } } return ERR_PTR(-EINVAL); resolved: if (nelems && size > U32_MAX / nelems) return ERR_PTR(-EINVAL); *type_size = nelems * size; if (total_nelems) *total_nelems = nelems; if (elem_type) *elem_type = type; if (elem_id) *elem_id = array ? array->type : 0; if (type_id && id) *type_id = id; return array_type ? : type; } const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size) { return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); } static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id) { while (type_id < btf->start_id) btf = btf->base_btf; return btf->resolved_ids[type_id - btf->start_id]; } /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) { *type_id = btf_resolved_type_id(btf, *type_id); return btf_type_by_id(btf, *type_id); } static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id) { while (type_id < btf->start_id) btf = btf->base_btf; return btf->resolved_sizes[type_id - btf->start_id]; } const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size) { const struct btf_type *size_type; u32 size_type_id = *type_id; u32 size = 0; size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; if (btf_type_has_size(size_type)) { size = size_type->size; } else if (btf_type_is_array(size_type)) { size = btf_resolved_type_size(btf, size_type_id); } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && !btf_type_is_var(size_type))) return NULL; size_type_id = btf_resolved_type_id(btf, size_type_id); size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; else if (btf_type_has_size(size_type)) size = size_type->size; else if (btf_type_is_array(size_type)) size = btf_resolved_type_size(btf, size_type_id); else if (btf_type_is_ptr(size_type)) size = sizeof(void *); else return NULL; } *type_id = size_type_id; if (ret_size) *ret_size = size; return size_type; } static int btf_df_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { btf_verifier_log_basic(env, struct_type, "Unsupported check_member"); return -EINVAL; } static int btf_df_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { btf_verifier_log_basic(env, struct_type, "Unsupported check_kflag_member"); return -EINVAL; } /* Used for ptr, array struct/union and float type members. * int, enum and modifier types have their specific callback functions. */ static int btf_generic_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) { btf_verifier_log_member(env, struct_type, member, "Invalid member bitfield_size"); return -EINVAL; } /* bitfield size is 0, so member->offset represents bit offset only. * It is safe to call non kflag check_member variants. */ return btf_type_ops(member_type)->check_member(env, struct_type, member, member_type); } static int btf_df_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { btf_verifier_log_basic(env, v->t, "Unsupported resolve"); return -EINVAL; } static void btf_df_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offsets, struct btf_show *show) { btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); } static int btf_int_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 int_data = btf_type_int(member_type); u32 struct_bits_off = member->offset; u32 struct_size = struct_type->size; u32 nr_copy_bits; u32 bytes_offset; if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) { btf_verifier_log_member(env, struct_type, member, "bits_offset exceeds U32_MAX"); return -EINVAL; } struct_bits_off += BTF_INT_OFFSET(int_data); bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = BTF_INT_BITS(int_data) + BITS_PER_BYTE_MASKED(struct_bits_off); if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, "nr_copy_bits exceeds 128"); return -EINVAL; } if (struct_size < bytes_offset || struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static int btf_int_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset; u32 int_data = btf_type_int(member_type); u32 struct_size = struct_type->size; u32 nr_copy_bits; /* a regular int type is required for the kflag int member */ if (!btf_type_int_is_regular(member_type)) { btf_verifier_log_member(env, struct_type, member, "Invalid member base type"); return -EINVAL; } /* check sanity of bitfield size */ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); nr_int_data_bits = BTF_INT_BITS(int_data); if (!nr_bits) { /* Not a bitfield member, member offset must be at byte * boundary. */ if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Invalid member offset"); return -EINVAL; } nr_bits = nr_int_data_bits; } else if (nr_bits > nr_int_data_bits) { btf_verifier_log_member(env, struct_type, member, "Invalid member bitfield_size"); return -EINVAL; } bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, "nr_copy_bits exceeds 128"); return -EINVAL; } if (struct_size < bytes_offset || struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { u32 int_data, nr_bits, meta_needed = sizeof(int_data); u16 encoding; if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } int_data = btf_type_int(t); if (int_data & ~BTF_INT_MASK) { btf_verifier_log_basic(env, t, "Invalid int_data:%x", int_data); return -EINVAL; } nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); if (nr_bits > BITS_PER_U128) { btf_verifier_log_type(env, t, "nr_bits exceeds %zu", BITS_PER_U128); return -EINVAL; } if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); return -EINVAL; } /* * Only one of the encoding bits is allowed and it * should be sufficient for the pretty print purpose (i.e. decoding). * Multiple bits can be allowed later if it is found * to be insufficient. */ encoding = BTF_INT_ENCODING(int_data); if (encoding && encoding != BTF_INT_SIGNED && encoding != BTF_INT_CHAR && encoding != BTF_INT_BOOL) { btf_verifier_log_type(env, t, "Unsupported encoding"); return -ENOTSUPP; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static void btf_int_log(struct btf_verifier_env *env, const struct btf_type *t) { int int_data = btf_type_int(t); btf_verifier_log(env, "size=%u bits_offset=%u nr_bits=%u encoding=%s", t->size, BTF_INT_OFFSET(int_data), BTF_INT_BITS(int_data), btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } static void btf_int128_print(struct btf_show *show, void *data) { /* data points to a __int128 number. * Suppose * int128_num = *(__int128 *)data; * The below formulas shows what upper_num and lower_num represents: * upper_num = int128_num >> 64; * lower_num = int128_num & 0xffffffffFFFFFFFFULL; */ u64 upper_num, lower_num; #ifdef __BIG_ENDIAN_BITFIELD upper_num = *(u64 *)data; lower_num = *(u64 *)(data + 8); #else upper_num = *(u64 *)(data + 8); lower_num = *(u64 *)data; #endif if (upper_num == 0) btf_show_type_value(show, "0x%llx", lower_num); else btf_show_type_values(show, "0x%llx%016llx", upper_num, lower_num); } static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, u16 right_shift_bits) { u64 upper_num, lower_num; #ifdef __BIG_ENDIAN_BITFIELD upper_num = print_num[0]; lower_num = print_num[1]; #else upper_num = print_num[1]; lower_num = print_num[0]; #endif /* shake out un-needed bits by shift/or operations */ if (left_shift_bits >= 64) { upper_num = lower_num << (left_shift_bits - 64); lower_num = 0; } else { upper_num = (upper_num << left_shift_bits) | (lower_num >> (64 - left_shift_bits)); lower_num = lower_num << left_shift_bits; } if (right_shift_bits >= 64) { lower_num = upper_num >> (right_shift_bits - 64); upper_num = 0; } else { lower_num = (lower_num >> right_shift_bits) | (upper_num << (64 - right_shift_bits)); upper_num = upper_num >> right_shift_bits; } #ifdef __BIG_ENDIAN_BITFIELD print_num[0] = upper_num; print_num[1] = lower_num; #else print_num[0] = lower_num; print_num[1] = upper_num; #endif } static void btf_bitfield_show(void *data, u8 bits_offset, u8 nr_bits, struct btf_show *show) { u16 left_shift_bits, right_shift_bits; u8 nr_copy_bytes; u8 nr_copy_bits; u64 print_num[2] = {}; nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); memcpy(print_num, data, nr_copy_bytes); #ifdef __BIG_ENDIAN_BITFIELD left_shift_bits = bits_offset; #else left_shift_bits = BITS_PER_U128 - nr_copy_bits; #endif right_shift_bits = BITS_PER_U128 - nr_bits; btf_int128_shift(print_num, left_shift_bits, right_shift_bits); btf_int128_print(show, print_num); } static void btf_int_bits_show(const struct btf *btf, const struct btf_type *t, void *data, u8 bits_offset, struct btf_show *show) { u32 int_data = btf_type_int(t); u8 nr_bits = BTF_INT_BITS(int_data); u8 total_bits_offset; /* * bits_offset is at most 7. * BTF_INT_OFFSET() cannot exceed 128 bits. */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); btf_bitfield_show(data, bits_offset, nr_bits, show); } static void btf_int_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; u8 nr_bits = BTF_INT_BITS(int_data); void *safe_data; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { btf_int_bits_show(btf, t, safe_data, bits_offset, show); goto out; } switch (nr_bits) { case 128: btf_int128_print(show, safe_data); break; case 64: if (sign) btf_show_type_value(show, "%lld", *(s64 *)safe_data); else btf_show_type_value(show, "%llu", *(u64 *)safe_data); break; case 32: if (sign) btf_show_type_value(show, "%d", *(s32 *)safe_data); else btf_show_type_value(show, "%u", *(u32 *)safe_data); break; case 16: if (sign) btf_show_type_value(show, "%d", *(s16 *)safe_data); else btf_show_type_value(show, "%u", *(u16 *)safe_data); break; case 8: if (show->state.array_encoding == BTF_INT_CHAR) { /* check for null terminator */ if (show->state.array_terminated) break; if (*(char *)data == '\0') { show->state.array_terminated = 1; break; } if (isprint(*(char *)data)) { btf_show_type_value(show, "'%c'", *(char *)safe_data); break; } } if (sign) btf_show_type_value(show, "%d", *(s8 *)safe_data); else btf_show_type_value(show, "%u", *(u8 *)safe_data); break; default: btf_int_bits_show(btf, t, safe_data, bits_offset, show); break; } out: btf_show_end_type(show); } static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, .show = btf_int_show, }; static int btf_modifier_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { const struct btf_type *resolved_type; u32 resolved_type_id = member->type; struct btf_member resolved_member; struct btf *btf = env->btf; resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); if (!resolved_type) { btf_verifier_log_member(env, struct_type, member, "Invalid member"); return -EINVAL; } resolved_member = *member; resolved_member.type = resolved_type_id; return btf_type_ops(resolved_type)->check_member(env, struct_type, &resolved_member, resolved_type); } static int btf_modifier_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { const struct btf_type *resolved_type; u32 resolved_type_id = member->type; struct btf_member resolved_member; struct btf *btf = env->btf; resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); if (!resolved_type) { btf_verifier_log_member(env, struct_type, member, "Invalid member"); return -EINVAL; } resolved_member = *member; resolved_member.type = resolved_type_id; return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type, &resolved_member, resolved_type); } static int btf_ptr_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_size, struct_bits_off, bytes_offset; struct_size = struct_type->size; struct_bits_off = member->offset; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } if (struct_size - bytes_offset < sizeof(void *)) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static int btf_ref_type_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const char *value; if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } /* typedef/type_tag type must have a valid name, and other ref types, * volatile, const, restrict, should have a null name. */ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) { value = btf_name_by_offset(env->btf, t->name_off); if (!value || !value[0]) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } } else { if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } } btf_verifier_log_type(env, t, NULL); return 0; } static int btf_modifier_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *t = v->t; const struct btf_type *next_type; u32 next_type_id = t->type; struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); /* Figure out the resolved next_type_id with size. * They will be stored in the current modifier's * resolved_ids and resolved_sizes such that it can * save us a few type-following when we use it later (e.g. in * pretty print). */ if (!btf_type_id_size(btf, &next_type_id, NULL)) { if (env_type_is_resolved(env, next_type_id)) next_type = btf_type_id_resolve(btf, &next_type_id); /* "typedef void new_void", "const void"...etc */ if (!btf_type_is_void(next_type) && !btf_type_is_fwd(next_type) && !btf_type_is_func_proto(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static int btf_var_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *next_type; const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; u32 resolved_type_id; resolved_type_id = next_type_id; resolved_type = btf_type_id_resolve(btf, &resolved_type_id); if (btf_type_is_ptr(resolved_type) && !env_type_is_resolve_sink(env, resolved_type) && !env_type_is_resolved(env, resolved_type_id)) return env_stack_push(env, resolved_type, resolved_type_id); } /* We must resolve to something concrete at this point, no * forward types or similar that would resolve to size of * zero is allowed. */ if (!btf_type_id_size(btf, &next_type_id, NULL)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static int btf_ptr_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *next_type; const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY, * the modifier may have stopped resolving when it was resolved * to a ptr (last-resolved-ptr). * * We now need to continue from the last-resolved-ptr to * ensure the last-resolved-ptr will not referring back to * the current ptr (t). */ if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; u32 resolved_type_id; resolved_type_id = next_type_id; resolved_type = btf_type_id_resolve(btf, &resolved_type_id); if (btf_type_is_ptr(resolved_type) && !env_type_is_resolve_sink(env, resolved_type) && !env_type_is_resolved(env, resolved_type_id)) return env_stack_push(env, resolved_type, resolved_type_id); } if (!btf_type_id_size(btf, &next_type_id, NULL)) { if (env_type_is_resolved(env, next_type_id)) next_type = btf_type_id_resolve(btf, &next_type_id); if (!btf_type_is_void(next_type) && !btf_type_is_fwd(next_type) && !btf_type_is_func_proto(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static void btf_modifier_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { if (btf->resolved_ids) t = btf_type_id_resolve(btf, &type_id); else t = btf_type_skip_modifiers(btf, type_id, NULL); btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } static void btf_var_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { t = btf_type_id_resolve(btf, &type_id); btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } static void btf_ptr_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { void *safe_data; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */ if (show->flags & BTF_SHOW_PTR_RAW) btf_show_type_value(show, "0x%px", *(void **)safe_data); else btf_show_type_value(show, "0x%p", *(void **)safe_data); btf_show_end_type(show); } static void btf_ref_type_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "type_id=%u", t->type); } static const struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, .show = btf_modifier_show, }; static const struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, .show = btf_ptr_show, }; static s32 btf_fwd_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (t->type) { btf_verifier_log_type(env, t, "type != 0"); return -EINVAL; } /* fwd type must have a valid name */ if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return 0; } static void btf_fwd_type_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct"); } static const struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_fwd_type_log, .show = btf_df_show, }; static int btf_array_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off = member->offset; u32 struct_size, bytes_offset; u32 array_type_id, array_size; struct btf *btf = env->btf; if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } array_type_id = member->type; btf_type_id_size(btf, &array_type_id, &array_size); struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (struct_size - bytes_offset < array_size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_array_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_array *array = btf_type_array(t); u32 meta_needed = sizeof(*array); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } /* array type should not have a name */ if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (t->size) { btf_verifier_log_type(env, t, "size != 0"); return -EINVAL; } /* Array elem type and index type cannot be in type void, * so !array->type and !array->index_type are not allowed. */ if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { btf_verifier_log_type(env, t, "Invalid elem"); return -EINVAL; } if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { btf_verifier_log_type(env, t, "Invalid index"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static int btf_array_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_array *array = btf_type_array(v->t); const struct btf_type *elem_type, *index_type; u32 elem_type_id, index_type_id; struct btf *btf = env->btf; u32 elem_size; /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); if (btf_type_nosize_or_null(index_type) || btf_type_is_resolve_source_only(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } if (!env_type_is_resolve_sink(env, index_type) && !env_type_is_resolved(env, index_type_id)) return env_stack_push(env, index_type, index_type_id); index_type = btf_type_id_size(btf, &index_type_id, NULL); if (!index_type || !btf_type_is_int(index_type) || !btf_type_int_is_regular(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); if (btf_type_nosize_or_null(elem_type) || btf_type_is_resolve_source_only(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; } if (!env_type_is_resolve_sink(env, elem_type) && !env_type_is_resolved(env, elem_type_id)) return env_stack_push(env, elem_type, elem_type_id); elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); if (!elem_type) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; } if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid array of int"); return -EINVAL; } if (array->nelems && elem_size > U32_MAX / array->nelems) { btf_verifier_log_type(env, v->t, "Array size overflows U32_MAX"); return -EINVAL; } env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); return 0; } static void btf_array_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_array *array = btf_type_array(t); btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", array->type, array->index_type, array->nelems); } static void __btf_array_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_array *array = btf_type_array(t); const struct btf_kind_operations *elem_ops; const struct btf_type *elem_type; u32 i, elem_size = 0, elem_type_id; u16 encoding = 0; elem_type_id = array->type; elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL); if (elem_type && btf_type_has_size(elem_type)) elem_size = elem_type->size; if (elem_type && btf_type_is_int(elem_type)) { u32 int_type = btf_type_int(elem_type); encoding = BTF_INT_ENCODING(int_type); /* * BTF_INT_CHAR encoding never seems to be set for * char arrays, so if size is 1 and element is * printable as a char, we'll do that. */ if (elem_size == 1) encoding = BTF_INT_CHAR; } if (!btf_show_start_array_type(show, t, type_id, encoding, data)) return; if (!elem_type) goto out; elem_ops = btf_type_ops(elem_type); for (i = 0; i < array->nelems; i++) { btf_show_start_array_member(show); elem_ops->show(btf, elem_type, elem_type_id, data, bits_offset, show); data += elem_size; btf_show_end_array_member(show); if (show->state.array_terminated) break; } out: btf_show_end_array_type(show); } static void btf_array_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_member *m = show->state.member; /* * First check if any members would be shown (are non-zero). * See comments above "struct btf_show" definition for more * details on how this works at a high-level. */ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { if (!show->state.depth_check) { show->state.depth_check = show->state.depth + 1; show->state.depth_to_show = 0; } __btf_array_show(btf, t, type_id, data, bits_offset, show); show->state.member = m; if (show->state.depth_check != show->state.depth + 1) return; show->state.depth_check = 0; if (show->state.depth_to_show <= show->state.depth) return; /* * Reaching here indicates we have recursed and found * non-zero array member(s). */ } __btf_array_show(btf, t, type_id, data, bits_offset, show); } static const struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, .show = btf_array_show, }; static int btf_struct_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off = member->offset; u32 struct_size, bytes_offset; if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_struct_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; const struct btf_member *member; u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; u32 offset; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } /* struct type either no name or a valid one */ if (t->name_off && !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); last_offset = 0; for_each_member(i, t, member) { if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, "Invalid member name_offset:%u", member->name_off); return -EINVAL; } /* struct member either no name or a valid one */ if (member->name_off && !btf_name_valid_identifier(btf, member->name_off)) { btf_verifier_log_member(env, t, member, "Invalid name"); return -EINVAL; } /* A member cannot be in type void */ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, "Invalid type_id"); return -EINVAL; } offset = __btf_member_bit_offset(t, member); if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } /* * ">" instead of ">=" because the last member could be * "char a[0];" */ if (last_offset > offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } if (BITS_ROUNDUP_BYTES(offset) > struct_size) { btf_verifier_log_member(env, t, member, "Member bits_offset exceeds its struct size"); return -EINVAL; } btf_verifier_log_member(env, t, member, NULL); last_offset = offset; } return meta_needed; } static int btf_struct_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_member *member; int err; u16 i; /* Before continue resolving the next_member, * ensure the last member is indeed resolved to a * type with size info. */ if (v->next_member) { const struct btf_type *last_member_type; const struct btf_member *last_member; u32 last_member_type_id; last_member = btf_type_member(v->t) + v->next_member - 1; last_member_type_id = last_member->type; if (WARN_ON_ONCE(!env_type_is_resolved(env, last_member_type_id))) return -EINVAL; last_member_type = btf_type_by_id(env->btf, last_member_type_id); if (btf_type_kflag(v->t)) err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t, last_member, last_member_type); else err = btf_type_ops(last_member_type)->check_member(env, v->t, last_member, last_member_type); if (err) return err; } for_each_member_from(i, v->next_member, v->t, member) { u32 member_type_id = member->type; const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); if (btf_type_nosize_or_null(member_type) || btf_type_is_resolve_source_only(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; } if (!env_type_is_resolve_sink(env, member_type) && !env_type_is_resolved(env, member_type_id)) { env_stack_set_next_member(env, i + 1); return env_stack_push(env, member_type, member_type_id); } if (btf_type_kflag(v->t)) err = btf_type_ops(member_type)->check_kflag_member(env, v->t, member, member_type); else err = btf_type_ops(member_type)->check_member(env, v->t, member, member_type); if (err) return err; } env_stack_pop_resolved(env, 0, 0); return 0; } static void btf_struct_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } enum { BTF_FIELD_IGNORE = 0, BTF_FIELD_FOUND = 1, }; struct btf_field_info { enum btf_field_type type; u32 off; union { struct { u32 type_id; } kptr; struct { const char *node_name; u32 value_btf_id; } graph_root; }; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, u32 off, int sz, enum btf_field_type field_type, struct btf_field_info *info) { if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; info->type = field_type; info->off = off; return BTF_FIELD_FOUND; } static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; const char *tag_value; bool is_type_tag; u32 res_id; /* Permit modifiers on the pointer itself */ if (btf_type_is_volatile(t)) t = btf_type_by_id(btf, t->type); /* For PTR, sz is always == 8 */ if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; t = btf_type_by_id(btf, t->type); is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t); if (!is_type_tag) return BTF_FIELD_IGNORE; /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; tag_value = __btf_name_by_offset(btf, t->name_off); if (!strcmp("kptr_untrusted", tag_value)) type = BPF_KPTR_UNREF; else if (!strcmp("kptr", tag_value)) type = BPF_KPTR_REF; else if (!strcmp("percpu_kptr", tag_value)) type = BPF_KPTR_PERCPU; else if (!strcmp("uptr", tag_value)) type = BPF_UPTR; else return -EINVAL; if (!(type & field_mask)) return BTF_FIELD_IGNORE; /* Get the base type */ t = btf_type_skip_modifiers(btf, t->type, &res_id); /* Only pointer to struct is allowed */ if (!__btf_type_is_struct(t)) return -EINVAL; info->type = type; info->off = off; info->kptr.type_id = res_id; return BTF_FIELD_FOUND; } int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key, int last_id) { int len = strlen(tag_key); int i, n; for (i = last_id + 1, n = btf_nr_types(btf); i < n; i++) { const struct btf_type *t = btf_type_by_id(btf, i); if (!btf_type_is_decl_tag(t)) continue; if (pt != btf_type_by_id(btf, t->type)) continue; if (btf_type_decl_tag(t)->component_idx != comp_idx) continue; if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) continue; return i; } return -ENOENT; } const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key) { const char *value = NULL; const struct btf_type *t; int len, id; id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0); if (id < 0) return ERR_PTR(id); t = btf_type_by_id(btf, id); len = strlen(tag_key); value = __btf_name_by_offset(btf, t->name_off) + len; /* Prevent duplicate entries for same type */ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, id); if (id >= 0) return ERR_PTR(-EEXIST); return value; } static int btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, const struct btf_type *t, int comp_idx, u32 off, int sz, struct btf_field_info *info, enum btf_field_type head_type) { const char *node_field_name; const char *value_type; s32 id; if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); if (IS_ERR(value_type)) return -EINVAL; node_field_name = strstr(value_type, ":"); if (!node_field_name) return -EINVAL; value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!value_type) return -ENOMEM; id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); kfree(value_type); if (id < 0) return id; node_field_name++; if (str_is_empty(node_field_name)) return -EINVAL; info->type = head_type; info->off = off; info->graph_root.value_btf_id = id; info->graph_root.node_name = node_field_name; return BTF_FIELD_FOUND; } #define field_mask_test_name(field_type, field_type_str) \ if (field_mask & field_type && !strcmp(name, field_type_str)) { \ type = field_type; \ goto end; \ } static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, u32 field_mask, u32 *seen_mask, int *align, int *sz) { int type = 0; const char *name = __btf_name_by_offset(btf, var_type->name_off); if (field_mask & BPF_SPIN_LOCK) { if (!strcmp(name, "bpf_spin_lock")) { if (*seen_mask & BPF_SPIN_LOCK) return -E2BIG; *seen_mask |= BPF_SPIN_LOCK; type = BPF_SPIN_LOCK; goto end; } } if (field_mask & BPF_RES_SPIN_LOCK) { if (!strcmp(name, "bpf_res_spin_lock")) { if (*seen_mask & BPF_RES_SPIN_LOCK) return -E2BIG; *seen_mask |= BPF_RES_SPIN_LOCK; type = BPF_RES_SPIN_LOCK; goto end; } } if (field_mask & BPF_TIMER) { if (!strcmp(name, "bpf_timer")) { if (*seen_mask & BPF_TIMER) return -E2BIG; *seen_mask |= BPF_TIMER; type = BPF_TIMER; goto end; } } if (field_mask & BPF_WORKQUEUE) { if (!strcmp(name, "bpf_wq")) { if (*seen_mask & BPF_WORKQUEUE) return -E2BIG; *seen_mask |= BPF_WORKQUEUE; type = BPF_WORKQUEUE; goto end; } } field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { type = BPF_KPTR_REF; goto end; } return 0; end: *sz = btf_field_type_size(type); *align = btf_field_type_align(type); return type; } #undef field_mask_test_name /* Repeat a number of fields for a specified number of times. * * Copy the fields starting from the first field and repeat them for * repeat_cnt times. The fields are repeated by adding the offset of each * field with * (i + 1) * elem_size * where i is the repeat index and elem_size is the size of an element. */ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { u32 i, j; u32 cur; /* Ensure not repeating fields that should not be repeated. */ for (i = 0; i < field_cnt; i++) { switch (info[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: case BPF_LIST_HEAD: case BPF_RB_ROOT: break; default: return -EINVAL; } } /* The type of struct size or variable size is u32, * so the multiplication will not overflow. */ if (field_cnt * (repeat_cnt + 1) > info_cnt) return -E2BIG; cur = field_cnt; for (i = 0; i < repeat_cnt; i++) { memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); for (j = 0; j < field_cnt; j++) info[cur++].off += (i + 1) * elem_size; } return 0; } static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level); /* Find special fields in the struct type of a field. * * This function is used to find fields of special types that is not a * global variable or a direct field of a struct type. It also handles the * repetition if it is the element type of an array. */ static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t, u32 off, u32 nelems, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, err, i; level++; if (level >= MAX_RESOLVE_DEPTH) return -E2BIG; ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level); if (ret <= 0) return ret; /* Shift the offsets of the nested struct fields to the offsets * related to the container. */ for (i = 0; i < ret; i++) info[i].off += off; if (nelems > 1) { err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size); if (err == 0) ret *= nelems; else ret = err; } return ret; } static int btf_find_field_one(const struct btf *btf, const struct btf_type *var, const struct btf_type *var_type, int var_idx, u32 off, u32 expected_size, u32 field_mask, u32 *seen_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, align, sz, field_type; struct btf_field_info tmp; const struct btf_array *array; u32 i, nelems = 1; /* Walk into array types to find the element type and the number of * elements in the (flattened) array. */ for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) { array = btf_array(var_type); nelems *= array->nelems; var_type = btf_type_by_id(btf, array->type); } if (i == MAX_RESOLVE_DEPTH) return -E2BIG; if (nelems == 0) return 0; field_type = btf_get_field_type(btf, var_type, field_mask, seen_mask, &align, &sz); /* Look into variables of struct types */ if (!field_type && __btf_type_is_struct(var_type)) { sz = var_type->size; if (expected_size && expected_size != sz * nelems) return 0; ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask, &info[0], info_cnt, level); return ret; } if (field_type == 0) return 0; if (field_type < 0) return field_type; if (expected_size && expected_size != sz * nelems) return 0; if (off % align) return 0; switch (field_type) { case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? &info[0] : &tmp); if (ret < 0) return ret; break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: ret = btf_find_kptr(btf, var_type, off, sz, info_cnt ? &info[0] : &tmp, field_mask); if (ret < 0) return ret; break; case BPF_LIST_HEAD: case BPF_RB_ROOT: ret = btf_find_graph_root(btf, var, var_type, var_idx, off, sz, info_cnt ? &info[0] : &tmp, field_type); if (ret < 0) return ret; break; default: return -EFAULT; } if (ret == BTF_FIELD_IGNORE) return 0; if (!info_cnt) return -E2BIG; if (nelems > 1) { ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz); if (ret < 0) return ret; } return nelems; } static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, idx = 0; const struct btf_member *member; u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; off /= 8; ret = btf_find_field_one(btf, t, member_type, i, off, 0, field_mask, &seen_mask, &info[idx], info_cnt - idx, level); if (ret < 0) return ret; idx += ret; } return idx; } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, idx = 0; const struct btf_var_secinfo *vsi; u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); off = vsi->offset; ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size, field_mask, &seen_mask, &info[idx], info_cnt - idx, level); if (ret < 0) return ret; idx += ret; } return idx; } static int btf_find_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) { if (__btf_type_is_struct(t)) return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0); else if (btf_type_is_datasec(t)) return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0); return -EINVAL; } /* Callers have to ensure the life cycle of btf if it is program BTF */ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { struct module *mod = NULL; const struct btf_type *t; /* If a matching btf type is found in kernel or module BTFs, kptr_ref * is that BTF, otherwise it's program BTF */ struct btf *kptr_btf; int ret; s32 id; /* Find type in map BTF, and use it to look up the matching type * in vmlinux or module BTFs, by name and kind. */ t = btf_type_by_id(btf, info->kptr.type_id); id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), &kptr_btf); if (id == -ENOENT) { /* btf_parse_kptr should only be called w/ btf = program BTF */ WARN_ON_ONCE(btf_is_kernel(btf)); /* Type exists only in program BTF. Assume that it's a MEM_ALLOC * kptr allocated via bpf_obj_new */ field->kptr.dtor = NULL; id = info->kptr.type_id; kptr_btf = (struct btf *)btf; goto found_dtor; } if (id < 0) return id; /* Find and stash the function pointer for the destruction function that * needs to be eventually invoked from the map free path. */ if (info->type == BPF_KPTR_REF) { const struct btf_type *dtor_func; const char *dtor_func_name; unsigned long addr; s32 dtor_btf_id; /* This call also serves as a whitelist of allowed objects that * can be used as a referenced pointer and be stored in a map at * the same time. */ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id); if (dtor_btf_id < 0) { ret = dtor_btf_id; goto end_btf; } dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id); if (!dtor_func) { ret = -ENOENT; goto end_btf; } if (btf_is_module(kptr_btf)) { mod = btf_try_get_module(kptr_btf); if (!mod) { ret = -ENXIO; goto end_btf; } } /* We already verified dtor_func to be btf_type_is_func * in register_btf_id_dtor_kfuncs. */ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off); addr = kallsyms_lookup_name(dtor_func_name); if (!addr) { ret = -EINVAL; goto end_mod; } field->kptr.dtor = (void *)addr; } found_dtor: field->kptr.btf_id = id; field->kptr.btf = kptr_btf; field->kptr.module = mod; return 0; end_mod: module_put(mod); end_btf: btf_put(kptr_btf); return ret; } static int btf_parse_graph_root(const struct btf *btf, struct btf_field *field, struct btf_field_info *info, const char *node_type_name, size_t node_type_align) { const struct btf_type *t, *n = NULL; const struct btf_member *member; u32 offset; int i; t = btf_type_by_id(btf, info->graph_root.value_btf_id); /* We've already checked that value_btf_id is a struct type. We * just need to figure out the offset of the list_node, and * verify its type. */ for_each_member(i, t, member) { if (strcmp(info->graph_root.node_name, __btf_name_by_offset(btf, member->name_off))) continue; /* Invalid BTF, two members with same name */ if (n) return -EINVAL; n = btf_type_by_id(btf, member->type); if (!__btf_type_is_struct(n)) return -EINVAL; if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off))) return -EINVAL; offset = __btf_member_bit_offset(n, member); if (offset % 8) return -EINVAL; offset /= 8; if (offset % node_type_align) return -EINVAL; field->graph_root.btf = (struct btf *)btf; field->graph_root.value_btf_id = info->graph_root.value_btf_id; field->graph_root.node_offset = offset; } if (!n) return -ENOENT; return 0; } static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { return btf_parse_graph_root(btf, field, info, "bpf_list_node", __alignof__(struct bpf_list_node)); } static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { return btf_parse_graph_root(btf, field, info, "bpf_rb_node", __alignof__(struct bpf_rb_node)); } static int btf_field_cmp(const void *_a, const void *_b, const void *priv) { const struct btf_field *a = (const struct btf_field *)_a; const struct btf_field *b = (const struct btf_field *)_b; if (a->offset < b->offset) return -1; else if (a->offset > b->offset) return 1; return 0; } struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size) { struct btf_field_info info_arr[BTF_FIELDS_MAX]; u32 next_off = 0, field_type_size; struct btf_record *rec; int ret, i, cnt; ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) return ERR_PTR(ret); if (!ret) return NULL; cnt = ret; /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. */ rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); rec->spin_lock_off = -EINVAL; rec->res_spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); ret = -EFAULT; goto end; } if (info_arr[i].off < next_off) { ret = -EEXIST; goto end; } next_off = info_arr[i].off + field_type_size; rec->field_mask |= info_arr[i].type; rec->fields[i].offset = info_arr[i].off; rec->fields[i].type = info_arr[i].type; rec->fields[i].size = field_type_size; switch (info_arr[i].type) { case BPF_SPIN_LOCK: WARN_ON_ONCE(rec->spin_lock_off >= 0); /* Cache offset for faster lookup at runtime */ rec->spin_lock_off = rec->fields[i].offset; break; case BPF_RES_SPIN_LOCK: WARN_ON_ONCE(rec->spin_lock_off >= 0); /* Cache offset for faster lookup at runtime */ rec->res_spin_lock_off = rec->fields[i].offset; break; case BPF_TIMER: WARN_ON_ONCE(rec->timer_off >= 0); /* Cache offset for faster lookup at runtime */ rec->timer_off = rec->fields[i].offset; break; case BPF_WORKQUEUE: WARN_ON_ONCE(rec->wq_off >= 0); /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ rec->refcount_off = rec->fields[i].offset; break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; break; case BPF_LIST_HEAD: ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; break; case BPF_RB_ROOT: ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; break; case BPF_LIST_NODE: case BPF_RB_NODE: break; default: ret = -EFAULT; goto end; } rec->cnt++; } if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) { ret = -EINVAL; goto end; } /* bpf_{list_head, rb_node} require bpf_spin_lock */ if ((btf_record_has_field(rec, BPF_LIST_HEAD) || btf_record_has_field(rec, BPF_RB_ROOT)) && (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) { ret = -EINVAL; goto end; } if (rec->refcount_off < 0 && btf_record_has_field(rec, BPF_LIST_NODE) && btf_record_has_field(rec, BPF_RB_NODE)) { ret = -EINVAL; goto end; } sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp, NULL, rec); return rec; end: btf_record_free(rec); return ERR_PTR(ret); } int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) { int i; /* There are three types that signify ownership of some other type: * kptr_ref, bpf_list_head, bpf_rb_root. * kptr_ref only supports storing kernel types, which can't store * references to program allocated local types. * * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. */ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR))) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; const struct btf_type *t; u32 btf_id; if (rec->fields[i].type == BPF_UPTR) { /* The uptr only supports pinning one page and cannot * point to a kernel struct */ if (btf_is_kernel(rec->fields[i].kptr.btf)) return -EINVAL; t = btf_type_by_id(rec->fields[i].kptr.btf, rec->fields[i].kptr.btf_id); if (!t->size) return -EINVAL; if (t->size > PAGE_SIZE) return -E2BIG; continue; } if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; meta = btf_find_struct_meta(btf, btf_id); if (!meta) return -EFAULT; rec->fields[i].graph_root.value_rec = meta->record; /* We need to set value_rec for all root types, but no need * to check ownership cycle for a type unless it's also a * node type. */ if (!(rec->field_mask & BPF_GRAPH_NODE)) continue; /* We need to ensure ownership acyclicity among all types. The * proper way to do it would be to topologically sort all BTF * IDs based on the ownership edges, since there can be multiple * bpf_{list_head,rb_node} in a type. Instead, we use the * following resaoning: * * - A type can only be owned by another type in user BTF if it * has a bpf_{list,rb}_node. Let's call these node types. * - A type can only _own_ another type in user BTF if it has a * bpf_{list_head,rb_root}. Let's call these root types. * * We ensure that if a type is both a root and node, its * element types cannot be root types. * * To ensure acyclicity: * * When A is an root type but not a node, its ownership * chain can be: * A -> B -> C * Where: * - A is an root, e.g. has bpf_rb_root. * - B is both a root and node, e.g. has bpf_rb_node and * bpf_list_head. * - C is only an root, e.g. has bpf_list_node * * When A is both a root and node, some other type already * owns it in the BTF domain, hence it can not own * another root type through any of the ownership edges. * A -> B * Where: * - A is both an root and node. * - B is only an node. */ if (meta->record->field_mask & BPF_GRAPH_ROOT) return -ELOOP; } return 0; } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_member *member; void *safe_data; u32 i; safe_data = btf_show_start_struct_type(show, t, type_id, data); if (!safe_data) return; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); const struct btf_kind_operations *ops; u32 member_offset, bitfield_size; u32 bytes_offset; u8 bits8_offset; btf_show_start_member(show, member); member_offset = __btf_member_bit_offset(t, member); bitfield_size = __btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { safe_data = btf_show_start_type(show, member_type, member->type, data + bytes_offset); if (safe_data) btf_bitfield_show(safe_data, bits8_offset, bitfield_size, show); btf_show_end_type(show); } else { ops = btf_type_ops(member_type); ops->show(btf, member_type, member->type, data + bytes_offset, bits8_offset, show); } btf_show_end_member(show); } btf_show_end_struct_type(show); } static void btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_member *m = show->state.member; /* * First check if any members would be shown (are non-zero). * See comments above "struct btf_show" definition for more * details on how this works at a high-level. */ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { if (!show->state.depth_check) { show->state.depth_check = show->state.depth + 1; show->state.depth_to_show = 0; } __btf_struct_show(btf, t, type_id, data, bits_offset, show); /* Restore saved member data here */ show->state.member = m; if (show->state.depth_check != show->state.depth + 1) return; show->state.depth_check = 0; if (show->state.depth_to_show <= show->state.depth) return; /* * Reaching here indicates we have recursed and found * non-zero child values. */ } __btf_struct_show(btf, t, type_id, data, bits_offset, show); } static const struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, .show = btf_struct_show, }; static int btf_enum_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off = member->offset; u32 struct_size, bytes_offset; if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static int btf_enum_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off, nr_bits, bytes_end, struct_size; u32 int_bitsize = sizeof(int) * BITS_PER_BYTE; struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); if (!nr_bits) { if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } nr_bits = int_bitsize; } else if (nr_bits > int_bitsize) { btf_verifier_log_member(env, struct_type, member, "Invalid member bitfield_size"); return -EINVAL; } struct_size = struct_type->size; bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits); if (struct_size < bytes_end) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; const char *fmt_str; u16 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); meta_needed = nr_enums * sizeof(*enums); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } /* enum type either no name or a valid one */ if (t->name_off && !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { if (!btf_name_offset_valid(btf, enums[i].name_off)) { btf_verifier_log(env, "\tInvalid name_offset:%u", enums[i].name_off); return -EINVAL; } /* enum member must have a valid name */ if (!enums[i].name_off || !btf_name_valid_identifier(btf, enums[i].name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (env->log.level == BPF_LOG_KERNEL) continue; fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n"; btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } return meta_needed; } static void btf_enum_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } static void btf_enum_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_enum *enums = btf_type_enum(t); u32 i, nr_enums = btf_type_vlen(t); void *safe_data; int v; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; v = *(int *)safe_data; for (i = 0; i < nr_enums; i++) { if (v != enums[i].val) continue; btf_show_type_value(show, "%s", __btf_name_by_offset(btf, enums[i].name_off)); btf_show_end_type(show); return; } if (btf_type_kflag(t)) btf_show_type_value(show, "%d", v); else btf_show_type_value(show, "%u", v); btf_show_end_type(show); } static const struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .show = btf_enum_show, }; static s32 btf_enum64_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_enum64 *enums = btf_type_enum64(t); struct btf *btf = env->btf; const char *fmt_str; u16 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); meta_needed = nr_enums * sizeof(*enums); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } /* enum type either no name or a valid one */ if (t->name_off && !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { if (!btf_name_offset_valid(btf, enums[i].name_off)) { btf_verifier_log(env, "\tInvalid name_offset:%u", enums[i].name_off); return -EINVAL; } /* enum member must have a valid name */ if (!enums[i].name_off || !btf_name_valid_identifier(btf, enums[i].name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (env->log.level == BPF_LOG_KERNEL) continue; fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n"; btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), btf_enum64_value(enums + i)); } return meta_needed; } static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_enum64 *enums = btf_type_enum64(t); u32 i, nr_enums = btf_type_vlen(t); void *safe_data; s64 v; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; v = *(u64 *)safe_data; for (i = 0; i < nr_enums; i++) { if (v != btf_enum64_value(enums + i)) continue; btf_show_type_value(show, "%s", __btf_name_by_offset(btf, enums[i].name_off)); btf_show_end_type(show); return; } if (btf_type_kflag(t)) btf_show_type_value(show, "%lld", v); else btf_show_type_value(show, "%llu", v); btf_show_end_type(show); } static const struct btf_kind_operations enum64_ops = { .check_meta = btf_enum64_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .show = btf_enum64_show, }; static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static void btf_func_proto_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_param *args = (const struct btf_param *)(t + 1); u16 nr_args = btf_type_vlen(t), i; btf_verifier_log(env, "return=%u args=(", t->type); if (!nr_args) { btf_verifier_log(env, "void"); goto done; } if (nr_args == 1 && !args[0].type) { /* Only one vararg */ btf_verifier_log(env, "vararg"); goto done; } btf_verifier_log(env, "%u %s", args[0].type, __btf_name_by_offset(env->btf, args[0].name_off)); for (i = 1; i < nr_args - 1; i++) btf_verifier_log(env, ", %u %s", args[i].type, __btf_name_by_offset(env->btf, args[i].name_off)); if (nr_args > 1) { const struct btf_param *last_arg = &args[nr_args - 1]; if (last_arg->type) btf_verifier_log(env, ", %u %s", last_arg->type, __btf_name_by_offset(env->btf, last_arg->name_off)); else btf_verifier_log(env, ", vararg"); } done: btf_verifier_log(env, ")"); } static const struct btf_kind_operations func_proto_ops = { .check_meta = btf_func_proto_check_meta, .resolve = btf_df_resolve, /* * BTF_KIND_FUNC_PROTO cannot be directly referred by * a struct's member. * * It should be a function pointer instead. * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO) * * Hence, there is no btf_func_check_member(). */ .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_func_proto_log, .show = btf_df_show, }; static s32 btf_func_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) { btf_verifier_log_type(env, t, "Invalid func linkage"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return 0; } static int btf_func_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *t = v->t; u32 next_type_id = t->type; int err; err = btf_func_check(env, t); if (err) return err; env_stack_pop_resolved(env, next_type_id, 0); return 0; } static const struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_func_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, .show = btf_df_show, }; static s32 btf_var_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_var *var; u32 meta_needed = sizeof(*var); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } /* A var cannot be in type void */ if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } var = btf_type_var(t); if (var->linkage != BTF_VAR_STATIC && var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { btf_verifier_log_type(env, t, "Linkage not supported"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_var *var = btf_type_var(t); btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); } static const struct btf_kind_operations var_ops = { .check_meta = btf_var_check_meta, .resolve = btf_var_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_var_log, .show = btf_var_show, }; static s32 btf_datasec_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_var_secinfo *vsi; u64 last_vsi_end_off = 0, sum = 0; u32 i, meta_needed; meta_needed = btf_type_vlen(t) * sizeof(*vsi); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (!t->size) { btf_verifier_log_type(env, t, "size == 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (!t->name_off || !btf_name_valid_section(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); for_each_vsi(i, t, vsi) { /* A var cannot be in type void */ if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { btf_verifier_log_vsi(env, t, vsi, "Invalid type_id"); return -EINVAL; } if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { btf_verifier_log_vsi(env, t, vsi, "Invalid offset"); return -EINVAL; } if (!vsi->size || vsi->size > t->size) { btf_verifier_log_vsi(env, t, vsi, "Invalid size"); return -EINVAL; } last_vsi_end_off = vsi->offset + vsi->size; if (last_vsi_end_off > t->size) { btf_verifier_log_vsi(env, t, vsi, "Invalid offset+size"); return -EINVAL; } btf_verifier_log_vsi(env, t, vsi, NULL); sum += vsi->size; } if (t->size < sum) { btf_verifier_log_type(env, t, "Invalid btf_info size"); return -EINVAL; } return meta_needed; } static int btf_datasec_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_var_secinfo *vsi; struct btf *btf = env->btf; u16 i; env->resolve_mode = RESOLVE_TBD; for_each_vsi_from(i, v->next_member, v->t, vsi) { u32 var_type_id = vsi->type, type_id, type_size = 0; const struct btf_type *var_type = btf_type_by_id(env->btf, var_type_id); if (!var_type || !btf_type_is_var(var_type)) { btf_verifier_log_vsi(env, v->t, vsi, "Not a VAR kind member"); return -EINVAL; } if (!env_type_is_resolve_sink(env, var_type) && !env_type_is_resolved(env, var_type_id)) { env_stack_set_next_member(env, i + 1); return env_stack_push(env, var_type, var_type_id); } type_id = var_type->type; if (!btf_type_id_size(btf, &type_id, &type_size)) { btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); return -EINVAL; } if (vsi->size < type_size) { btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); return -EINVAL; } } env_stack_pop_resolved(env, 0, 0); return 0; } static void btf_datasec_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } static void btf_datasec_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_var_secinfo *vsi; const struct btf_type *var; u32 i; if (!btf_show_start_type(show, t, type_id, data)) return; btf_show_type_value(show, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); for_each_vsi(i, t, vsi) { var = btf_type_by_id(btf, vsi->type); if (i) btf_show(show, ","); btf_type_ops(var)->show(btf, var, vsi->type, data + vsi->offset, bits_offset, show); } btf_show_end_type(show); } static const struct btf_kind_operations datasec_ops = { .check_meta = btf_datasec_check_meta, .resolve = btf_datasec_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_datasec_log, .show = btf_datasec_show, }; static s32 btf_float_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 && t->size != 16) { btf_verifier_log_type(env, t, "Invalid type_size"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return 0; } static int btf_float_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u64 start_offset_bytes; u64 end_offset_bytes; u64 misalign_bits; u64 align_bytes; u64 align_bits; /* Different architectures have different alignment requirements, so * here we check only for the reasonable minimum. This way we ensure * that types after CO-RE can pass the kernel BTF verifier. */ align_bytes = min_t(u64, sizeof(void *), member_type->size); align_bits = align_bytes * BITS_PER_BYTE; div64_u64_rem(member->offset, align_bits, &misalign_bits); if (misalign_bits) { btf_verifier_log_member(env, struct_type, member, "Member is not properly aligned"); return -EINVAL; } start_offset_bytes = member->offset / BITS_PER_BYTE; end_offset_bytes = start_offset_bytes + member_type->size; if (end_offset_bytes > struct_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static void btf_float_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u", t->size); } static const struct btf_kind_operations float_ops = { .check_meta = btf_float_check_meta, .resolve = btf_df_resolve, .check_member = btf_float_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_float_log, .show = btf_df_show, }; static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_decl_tag *tag; u32 meta_needed = sizeof(*tag); s32 component_idx; const char *value; if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } value = btf_name_by_offset(env->btf, t->name_off); if (!value || !value[0]) { btf_verifier_log_type(env, t, "Invalid value"); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx < -1) { btf_verifier_log_type(env, t, "Invalid component_idx"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static int btf_decl_tag_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *next_type; const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; s32 component_idx; u32 vlen; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || !btf_type_is_decl_tag_target(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx != -1) { if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) { btf_verifier_log_type(env, v->t, "Invalid component_idx"); return -EINVAL; } if (btf_type_is_struct(next_type)) { vlen = btf_type_vlen(next_type); } else { /* next_type should be a function */ next_type = btf_type_by_id(btf, next_type->type); vlen = btf_type_vlen(next_type); } if ((u32)component_idx >= vlen) { btf_verifier_log_type(env, v->t, "Invalid component_idx"); return -EINVAL; } } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "type=%u component_idx=%d", t->type, btf_type_decl_tag(t)->component_idx); } static const struct btf_kind_operations decl_tag_ops = { .check_meta = btf_decl_tag_check_meta, .resolve = btf_decl_tag_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_decl_tag_log, .show = btf_df_show, }; static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_type *ret_type; const struct btf_param *args; const struct btf *btf; u16 nr_args, i; int err; btf = env->btf; args = (const struct btf_param *)(t + 1); nr_args = btf_type_vlen(t); /* Check func return type which could be "void" (t->type == 0) */ if (t->type) { u32 ret_type_id = t->type; ret_type = btf_type_by_id(btf, ret_type_id); if (!ret_type) { btf_verifier_log_type(env, t, "Invalid return type"); return -EINVAL; } if (btf_type_is_resolve_source_only(ret_type)) { btf_verifier_log_type(env, t, "Invalid return type"); return -EINVAL; } if (btf_type_needs_resolve(ret_type) && !env_type_is_resolved(env, ret_type_id)) { err = btf_resolve(env, ret_type, ret_type_id); if (err) return err; } /* Ensure the return type is a type that has a size */ if (!btf_type_id_size(btf, &ret_type_id, NULL)) { btf_verifier_log_type(env, t, "Invalid return type"); return -EINVAL; } } if (!nr_args) return 0; /* Last func arg type_id could be 0 if it is a vararg */ if (!args[nr_args - 1].type) { if (args[nr_args - 1].name_off) { btf_verifier_log_type(env, t, "Invalid arg#%u", nr_args); return -EINVAL; } nr_args--; } for (i = 0; i < nr_args; i++) { const struct btf_type *arg_type; u32 arg_type_id; arg_type_id = args[i].type; arg_type = btf_type_by_id(btf, arg_type_id); if (!arg_type) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } if (btf_type_is_resolve_source_only(arg_type)) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } if (args[i].name_off && (!btf_name_offset_valid(btf, args[i].name_off) || !btf_name_valid_identifier(btf, args[i].name_off))) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } if (btf_type_needs_resolve(arg_type) && !env_type_is_resolved(env, arg_type_id)) { err = btf_resolve(env, arg_type, arg_type_id); if (err) return err; } if (!btf_type_id_size(btf, &arg_type_id, NULL)) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } } return 0; } static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_type *proto_type; const struct btf_param *args; const struct btf *btf; u16 nr_args, i; btf = env->btf; proto_type = btf_type_by_id(btf, t->type); if (!proto_type || !btf_type_is_func_proto(proto_type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } args = (const struct btf_param *)(proto_type + 1); nr_args = btf_type_vlen(proto_type); for (i = 0; i < nr_args; i++) { if (!args[i].name_off && args[i].type) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } } return 0; } static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_INT] = &int_ops, [BTF_KIND_PTR] = &ptr_ops, [BTF_KIND_ARRAY] = &array_ops, [BTF_KIND_STRUCT] = &struct_ops, [BTF_KIND_UNION] = &struct_ops, [BTF_KIND_ENUM] = &enum_ops, [BTF_KIND_FWD] = &fwd_ops, [BTF_KIND_TYPEDEF] = &modifier_ops, [BTF_KIND_VOLATILE] = &modifier_ops, [BTF_KIND_CONST] = &modifier_ops, [BTF_KIND_RESTRICT] = &modifier_ops, [BTF_KIND_FUNC] = &func_ops, [BTF_KIND_FUNC_PROTO] = &func_proto_ops, [BTF_KIND_VAR] = &var_ops, [BTF_KIND_DATASEC] = &datasec_ops, [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, [BTF_KIND_TYPE_TAG] = &modifier_ops, [BTF_KIND_ENUM64] = &enum64_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { u32 saved_meta_left = meta_left; s32 var_meta_size; if (meta_left < sizeof(*t)) { btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", env->log_type_id, meta_left, sizeof(*t)); return -EINVAL; } meta_left -= sizeof(*t); if (t->info & ~BTF_INFO_MASK) { btf_verifier_log(env, "[%u] Invalid btf_info:%x", env->log_type_id, t->info); return -EINVAL; } if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", env->log_type_id, BTF_INFO_KIND(t->info)); return -EINVAL; } if (!btf_name_offset_valid(env->btf, t->name_off)) { btf_verifier_log(env, "[%u] Invalid name_offset:%u", env->log_type_id, t->name_off); return -EINVAL; } var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); if (var_meta_size < 0) return var_meta_size; meta_left -= var_meta_size; return saved_meta_left - meta_left; } static int btf_check_all_metas(struct btf_verifier_env *env) { struct btf *btf = env->btf; struct btf_header *hdr; void *cur, *end; hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; end = cur + hdr->type_len; env->log_type_id = btf->base_btf ? btf->start_id : 1; while (cur < end) { struct btf_type *t = cur; s32 meta_size; meta_size = btf_check_meta(env, t, end - cur); if (meta_size < 0) return meta_size; btf_add_type(env, t); cur += meta_size; env->log_type_id++; } return 0; } static bool btf_resolve_valid(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { struct btf *btf = env->btf; if (!env_type_is_resolved(env, type_id)) return false; if (btf_type_is_struct(t) || btf_type_is_datasec(t)) return !btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); if (btf_type_is_decl_tag(t) || btf_type_is_func(t)) return btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_var(t)) { t = btf_type_id_resolve(btf, &type_id); return t && !btf_type_is_modifier(t) && !btf_type_is_var(t) && !btf_type_is_datasec(t); } if (btf_type_is_array(t)) { const struct btf_array *array = btf_type_array(t); const struct btf_type *elem_type; u32 elem_type_id = array->type; u32 elem_size; elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); return elem_type && !btf_type_is_modifier(elem_type) && (array->nelems * elem_size == btf_resolved_type_size(btf, type_id)); } return false; } static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { u32 save_log_type_id = env->log_type_id; const struct resolve_vertex *v; int err = 0; env->resolve_mode = RESOLVE_TBD; env_stack_push(env, t, type_id); while (!err && (v = env_stack_peak(env))) { env->log_type_id = v->type_id; err = btf_type_ops(v->t)->resolve(env, v); } env->log_type_id = type_id; if (err == -E2BIG) { btf_verifier_log_type(env, t, "Exceeded max resolving depth:%u", MAX_RESOLVE_DEPTH); } else if (err == -EEXIST) { btf_verifier_log_type(env, t, "Loop detected"); } /* Final sanity check */ if (!err && !btf_resolve_valid(env, t, type_id)) { btf_verifier_log_type(env, t, "Invalid resolve state"); err = -EINVAL; } env->log_type_id = save_log_type_id; return err; } static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; const struct btf_type *t; u32 type_id, i; int err; err = env_resolve_init(env); if (err) return err; env->phase++; for (i = btf->base_btf ? 0 : 1; i < btf->nr_types; i++) { type_id = btf->start_id + i; t = btf_type_by_id(btf, type_id); env->log_type_id = type_id; if (btf_type_needs_resolve(t) && !env_type_is_resolved(env, type_id)) { err = btf_resolve(env, t, type_id); if (err) return err; } if (btf_type_is_func_proto(t)) { err = btf_func_proto_check(env, t); if (err) return err; } } return 0; } static int btf_parse_type_sec(struct btf_verifier_env *env) { const struct btf_header *hdr = &env->btf->hdr; int err; /* Type section must align to 4 bytes */ if (hdr->type_off & (sizeof(u32) - 1)) { btf_verifier_log(env, "Unaligned type_off"); return -EINVAL; } if (!env->btf->base_btf && !hdr->type_len) { btf_verifier_log(env, "No type found"); return -EINVAL; } err = btf_check_all_metas(env); if (err) return err; return btf_check_all_types(env); } static int btf_parse_str_sec(struct btf_verifier_env *env) { const struct btf_header *hdr; struct btf *btf = env->btf; const char *start, *end; hdr = &btf->hdr; start = btf->nohdr_data + hdr->str_off; end = start + hdr->str_len; if (end != btf->data + btf->data_size) { btf_verifier_log(env, "String section is not at the end"); return -EINVAL; } btf->strings = start; if (btf->base_btf && !hdr->str_len) return 0; if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } if (!btf->base_btf && start[0]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } return 0; } static const size_t btf_sec_info_offset[] = { offsetof(struct btf_header, type_off), offsetof(struct btf_header, str_off), }; static int btf_sec_info_cmp(const void *a, const void *b) { const struct btf_sec_info *x = a; const struct btf_sec_info *y = b; return (int)(x->off - y->off) ? : (int)(x->len - y->len); } static int btf_check_sec_info(struct btf_verifier_env *env, u32 btf_data_size) { struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; u32 total, expected_total, i; const struct btf_header *hdr; const struct btf *btf; btf = env->btf; hdr = &btf->hdr; /* Populate the secs from hdr */ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) secs[i] = *(struct btf_sec_info *)((void *)hdr + btf_sec_info_offset[i]); sort(secs, ARRAY_SIZE(btf_sec_info_offset), sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); /* Check for gaps and overlap among sections */ total = 0; expected_total = btf_data_size - hdr->hdr_len; for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { if (expected_total < secs[i].off) { btf_verifier_log(env, "Invalid section offset"); return -EINVAL; } if (total < secs[i].off) { /* gap */ btf_verifier_log(env, "Unsupported section found"); return -EINVAL; } if (total > secs[i].off) { btf_verifier_log(env, "Section overlap found"); return -EINVAL; } if (expected_total - total < secs[i].len) { btf_verifier_log(env, "Total section length too long"); return -EINVAL; } total += secs[i].len; } /* There is data other than hdr and known sections */ if (expected_total != total) { btf_verifier_log(env, "Unsupported section found"); return -EINVAL; } return 0; } static int btf_parse_hdr(struct btf_verifier_env *env) { u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; struct btf *btf; btf = env->btf; btf_data_size = btf->data_size; if (btf_data_size < offsetofend(struct btf_header, hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } hdr = btf->data; hdr_len = hdr->hdr_len; if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } /* Ensure the unsupported header fields are zero */ if (hdr_len > sizeof(btf->hdr)) { u8 *expected_zero = btf->data + sizeof(btf->hdr); u8 *end = btf->data + hdr_len; for (; expected_zero < end; expected_zero++) { if (*expected_zero) { btf_verifier_log(env, "Unsupported btf_header"); return -E2BIG; } } } hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); memcpy(&btf->hdr, btf->data, hdr_copy); hdr = &btf->hdr; btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { btf_verifier_log(env, "Invalid magic"); return -EINVAL; } if (hdr->version != BTF_VERSION) { btf_verifier_log(env, "Unsupported version"); return -ENOTSUPP; } if (hdr->flags) { btf_verifier_log(env, "Unsupported flags"); return -ENOTSUPP; } if (!btf->base_btf && btf_data_size == hdr->hdr_len) { btf_verifier_log(env, "No data"); return -EINVAL; } return btf_check_sec_info(env, btf_data_size); } static const char *alloc_obj_fields[] = { "bpf_spin_lock", "bpf_list_head", "bpf_list_node", "bpf_rb_root", "bpf_rb_node", "bpf_refcount", }; static struct btf_struct_metas * btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) { struct btf_struct_metas *tab = NULL; struct btf_id_set *aof; int i, n, id, ret; BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0); BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32)); aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN); if (!aof) return ERR_PTR(-ENOMEM); aof->cnt = 0; for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) { /* Try to find whether this special type exists in user BTF, and * if so remember its ID so we can easily find it among members * of structs that we iterate in the next loop. */ struct btf_id_set *new_aof; id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT); if (id < 0) continue; new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; goto free_aof; } aof = new_aof; aof->ids[aof->cnt++] = id; } n = btf_nr_types(btf); for (i = 1; i < n; i++) { /* Try to find if there are kptrs in user BTF and remember their ID */ struct btf_id_set *new_aof; struct btf_field_info tmp; const struct btf_type *t; t = btf_type_by_id(btf, i); if (!t) { ret = -EINVAL; goto free_aof; } ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR); if (ret != BTF_FIELD_FOUND) continue; new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; goto free_aof; } aof = new_aof; aof->ids[aof->cnt++] = i; } if (!aof->cnt) { kfree(aof); return NULL; } sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL); for (i = 1; i < n; i++) { struct btf_struct_metas *new_tab; const struct btf_member *member; struct btf_struct_meta *type; struct btf_record *record; const struct btf_type *t; int j, tab_cnt; t = btf_type_by_id(btf, i); if (!__btf_type_is_struct(t)) continue; cond_resched(); for_each_member(j, t, member) { if (btf_id_set_contains(aof, member->type)) goto parse; } continue; parse: tab_cnt = tab ? tab->cnt : 0; new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_tab) { ret = -ENOMEM; goto free; } if (!tab) new_tab->cnt = 0; tab = new_tab; type = &tab->types[tab->cnt]; type->btf_id = i; record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | BPF_KPTR, t->size); /* The record cannot be unset, treat it as an error if so */ if (IS_ERR_OR_NULL(record)) { ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; goto free; } type->record = record; tab->cnt++; } kfree(aof); return tab; free: btf_struct_metas_free(tab); free_aof: kfree(aof); return ERR_PTR(ret); } struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) { struct btf_struct_metas *tab; BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0); tab = btf->struct_meta_tab; if (!tab) return NULL; return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func); } static int btf_check_type_tags(struct btf_verifier_env *env, struct btf *btf, int start_id) { int i, n, good_id = start_id - 1; bool in_tags; n = btf_nr_types(btf); for (i = start_id; i < n; i++) { const struct btf_type *t; int chain_limit = 32; u32 cur_id = i; t = btf_type_by_id(btf, i); if (!t) return -EINVAL; if (!btf_type_is_modifier(t)) continue; cond_resched(); in_tags = btf_type_is_type_tag(t); while (btf_type_is_modifier(t)) { if (!chain_limit--) { btf_verifier_log(env, "Max chain length or cycle detected"); return -ELOOP; } if (btf_type_is_type_tag(t)) { if (!in_tags) { btf_verifier_log(env, "Type tags don't precede modifiers"); return -EINVAL; } } else if (in_tags) { in_tags = false; } if (cur_id <= good_id) break; /* Move to next type */ cur_id = t->type; t = btf_type_by_id(btf, cur_id); if (!t) return -EINVAL; } good_id = i; } return 0; } static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) { u32 log_true_size; int err; err = bpf_vlog_finalize(log, &log_true_size); if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), &log_true_size, sizeof(log_true_size))) err = -EFAULT; return err; } static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct btf *btf = NULL; u8 *data; int err, ret; if (attr->btf_size > BTF_MAX_SIZE) return ERR_PTR(-E2BIG); env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ err = bpf_vlog_init(&env->log, attr->btf_log_level, log_ubuf, attr->btf_log_size); if (err) goto errout_free; btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; goto errout; } env->btf = btf; data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; goto errout; } btf->data = data; btf->data_size = attr->btf_size; if (copy_from_bpfptr(data, btf_data, attr->btf_size)) { err = -EFAULT; goto errout; } err = btf_parse_hdr(env); if (err) goto errout; btf->nohdr_data = btf->data + btf->hdr.hdr_len; err = btf_parse_str_sec(env); if (err) goto errout; err = btf_parse_type_sec(env); if (err) goto errout; err = btf_check_type_tags(env, btf, 1); if (err) goto errout; struct_meta_tab = btf_parse_struct_metas(&env->log, btf); if (IS_ERR(struct_meta_tab)) { err = PTR_ERR(struct_meta_tab); goto errout; } btf->struct_meta_tab = struct_meta_tab; if (struct_meta_tab) { int i; for (i = 0; i < struct_meta_tab->cnt; i++) { err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record); if (err < 0) goto errout_meta; } } err = finalize_log(&env->log, uattr, uattr_size); if (err) goto errout_free; btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; errout_meta: btf_free_struct_meta_tab(btf); errout: /* overwrite err with -ENOSPC or -EFAULT */ ret = finalize_log(&env->log, uattr, uattr_size); if (ret) err = ret; errout_free: btf_verifier_env_free(env); if (btf) btf_free(btf); return ERR_PTR(err); } extern char __start_BTF[]; extern char __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ prog_ctx_type _id##_prog; \ kern_ctx_type _id##_kern; #include <linux/bpf_types.h> #undef BPF_PROG_TYPE } *__t; /* 't' is written once under lock. Read many times. */ const struct btf_type *t; } bpf_ctx_convert; enum { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ __ctx_convert##_id, #include <linux/bpf_types.h> #undef BPF_PROG_TYPE __ctx_convert_unused, /* to avoid empty enum in extreme .config */ }; static u8 bpf_ctx_convert_map[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = __ctx_convert##_id, #include <linux/bpf_types.h> #undef BPF_PROG_TYPE 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type) { const struct btf_type *conv_struct; const struct btf_member *ctx_type; conv_struct = bpf_ctx_convert.t; if (!conv_struct) return NULL; /* prog_type is valid bpf program type. No need for bounds check. */ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; /* ctx_type is a pointer to prog_ctx_type in vmlinux. * Like 'struct __sk_buff' */ return btf_type_by_id(btf_vmlinux, ctx_type->type); } static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) { const struct btf_type *conv_struct; const struct btf_member *ctx_type; conv_struct = bpf_ctx_convert.t; if (!conv_struct) return -EFAULT; /* prog_type is valid bpf program type. No need for bounds check. */ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; /* ctx_type is a pointer to prog_ctx_type in vmlinux. * Like 'struct sk_buff' */ return ctx_type->type; } bool btf_is_projection_of(const char *pname, const char *tname) { if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) return true; if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) return true; return false; } bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { const struct btf_type *ctx_type; const char *tname, *ctx_tname; t = btf_type_by_id(btf, t->type); /* KPROBE programs allow bpf_user_pt_regs_t typedef, which we need to * check before we skip all the typedef below. */ if (prog_type == BPF_PROG_TYPE_KPROBE) { while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_typedef(t)) { tname = btf_name_by_offset(btf, t->name_off); if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) return true; } } while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_struct(t)) { /* Only pointer to struct is supported for now. * That means that BPF_PROG_TYPE_TRACEPOINT with BTF * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ return false; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return false; } ctx_type = find_canonical_prog_ctx_type(prog_type); if (!ctx_type) { bpf_log(log, "btf_vmlinux is malformed\n"); /* should not happen */ return false; } again: ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!ctx_tname) { /* should not happen */ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); return false; } /* program types without named context types work only with arg:ctx tag */ if (ctx_tname[0] == '\0') return false; /* only compare that prog's ctx type name is the same as * kernel expects. No need to compare field by field. * It's ok for bpf prog to do: * struct __sk_buff {}; * int socket_filter_bpf_prog(struct __sk_buff *skb) * { // no fields of skb are ever used } */ if (btf_is_projection_of(ctx_tname, tname)) return true; if (strcmp(ctx_tname, tname)) { /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again */ if (!btf_type_is_modifier(ctx_type)) return false; while (btf_type_is_modifier(ctx_type)) ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); goto again; } return true; } /* forward declarations for arch-specific underlying types of * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still * works correctly with __builtin_types_compatible_p() on respective * architectures */ struct user_regs_struct; struct user_pt_regs; static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int arg, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type) { const struct btf_type *ctx_type; const char *tname, *ctx_tname; if (!btf_is_ptr(t)) { bpf_log(log, "arg#%d type isn't a pointer\n", arg); return -EINVAL; } t = btf_type_by_id(btf, t->type); /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */ if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) { while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_typedef(t)) { tname = btf_name_by_offset(btf, t->name_off); if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) return 0; } } /* all other program types don't use typedefs for context type */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); /* `void *ctx __arg_ctx` is always valid */ if (btf_type_is_void(t)) return 0; tname = btf_name_by_offset(btf, t->name_off); if (str_is_empty(tname)) { bpf_log(log, "arg#%d type doesn't have a name\n", arg); return -EINVAL; } /* special cases */ switch (prog_type) { case BPF_PROG_TYPE_KPROBE: if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) return 0; break; case BPF_PROG_TYPE_PERF_EVENT: if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) && __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) return 0; if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) && __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0) return 0; if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) && __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0) return 0; break; case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; case BPF_PROG_TYPE_TRACING: switch (attach_type) { case BPF_TRACE_RAW_TP: /* tp_btf program is TRACING, so need special case here */ if (__btf_type_is_struct(t) && strcmp(tname, "bpf_raw_tracepoint_args") == 0) return 0; /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; case BPF_TRACE_ITER: /* allow struct bpf_iter__xxx types only */ if (__btf_type_is_struct(t) && strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0) return 0; break; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; default: break; } break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: return 0; /* anything goes */ default: break; } ctx_type = find_canonical_prog_ctx_type(prog_type); if (!ctx_type) { /* should not happen */ bpf_log(log, "btf_vmlinux is malformed\n"); return -EINVAL; } /* resolve typedefs and check that underlying structs are matching as well */ while (btf_type_is_modifier(ctx_type)) ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); /* if program type doesn't have distinctly named struct type for * context, then __arg_ctx argument can only be `void *`, which we * already checked above */ if (!__btf_type_is_struct(ctx_type)) { bpf_log(log, "arg#%d should be void pointer\n", arg); return -EINVAL; } ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) { bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname); return -EINVAL; } return 0; } static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg)) return -ENOENT; return find_kern_ctx_type_id(prog_type); } int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) { const struct btf_member *kctx_member; const struct btf_type *conv_struct; const struct btf_type *kctx_type; u32 kctx_type_id; conv_struct = bpf_ctx_convert.t; /* get member for kernel ctx type */ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; kctx_type_id = kctx_member->type; kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id); if (!btf_type_is_struct(kctx_type)) { bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id); return -EINVAL; } return kctx_type_id; } BTF_ID_LIST_SINGLE(bpf_ctx_convert_btf_id, struct, bpf_ctx_convert) static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name, void *data, unsigned int data_size) { struct btf *btf = NULL; int err; if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) return ERR_PTR(-ENOENT); btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; goto errout; } env->btf = btf; btf->data = data; btf->data_size = data_size; btf->kernel_btf = true; snprintf(btf->name, sizeof(btf->name), "%s", name); err = btf_parse_hdr(env); if (err) goto errout; btf->nohdr_data = btf->data + btf->hdr.hdr_len; err = btf_parse_str_sec(env); if (err) goto errout; err = btf_check_all_metas(env); if (err) goto errout; err = btf_check_type_tags(env, btf, 1); if (err) goto errout; refcount_set(&btf->refcnt, 1); return btf; errout: if (btf) { kvfree(btf->types); kfree(btf); } return ERR_PTR(err); } struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf; int err; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); log = &env->log; log->level = BPF_LOG_KERNEL; btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF); if (IS_ERR(btf)) goto err_out; /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); err = btf_alloc_id(btf); if (err) { btf_free(btf); btf = ERR_PTR(err); } err_out: btf_verifier_env_free(env); return btf; } /* If .BTF_ids section was created with distilled base BTF, both base and * split BTF ids will need to be mapped to actual base/split ids for * BTF now that it has been relocated. */ static __u32 btf_relocate_id(const struct btf *btf, __u32 id) { if (!btf->base_btf || !btf->base_id_map) return id; return btf->base_id_map[id]; } #ifdef CONFIG_DEBUG_INFO_BTF_MODULES static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size, void *base_data, unsigned int base_data_size) { struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL; struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; int err = 0; vmlinux_btf = bpf_get_btf_vmlinux(); if (IS_ERR(vmlinux_btf)) return vmlinux_btf; if (!vmlinux_btf) return ERR_PTR(-EINVAL); env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); log = &env->log; log->level = BPF_LOG_KERNEL; if (base_data) { base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size); if (IS_ERR(base_btf)) { err = PTR_ERR(base_btf); goto errout; } } else { base_btf = vmlinux_btf; } btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; goto errout; } env->btf = btf; btf->base_btf = base_btf; btf->start_id = base_btf->nr_types; btf->start_str_off = base_btf->hdr.str_len; btf->kernel_btf = true; snprintf(btf->name, sizeof(btf->name), "%s", module_name); btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); if (!btf->data) { err = -ENOMEM; goto errout; } btf->data_size = data_size; err = btf_parse_hdr(env); if (err) goto errout; btf->nohdr_data = btf->data + btf->hdr.hdr_len; err = btf_parse_str_sec(env); if (err) goto errout; err = btf_check_all_metas(env); if (err) goto errout; err = btf_check_type_tags(env, btf, btf_nr_types(base_btf)); if (err) goto errout; if (base_btf != vmlinux_btf) { err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map); if (err) goto errout; btf_free(base_btf); base_btf = vmlinux_btf; } btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; errout: btf_verifier_env_free(env); if (!IS_ERR(base_btf) && base_btf != vmlinux_btf) btf_free(base_btf); if (btf) { kvfree(btf->data); kvfree(btf->types); kfree(btf); } return ERR_PTR(err); } #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; if (tgt_prog) return tgt_prog->aux->btf; else return prog->aux->attach_btf; } static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t) { /* skip modifiers */ t = btf_type_skip_modifiers(btf, t->type, NULL); return btf_type_is_void(t) || btf_type_is_int(t); } u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off) { const struct btf_param *args; const struct btf_type *t; u32 offset = 0, nr_args; int i; if (!func_proto) return off / 8; nr_args = btf_type_vlen(func_proto); args = (const struct btf_param *)(func_proto + 1); for (i = 0; i < nr_args; i++) { t = btf_type_skip_modifiers(btf, args[i].type, NULL); offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); if (off < offset) return i; } t = btf_type_skip_modifiers(btf, func_proto->type, NULL); offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); if (off < offset) return nr_args; return nr_args + 1; } static bool prog_args_trusted(const struct bpf_prog *prog) { enum bpf_attach_type atype = prog->expected_attach_type; switch (prog->type) { case BPF_PROG_TYPE_TRACING: return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER; case BPF_PROG_TYPE_LSM: return bpf_lsm_is_trusted(prog); case BPF_PROG_TYPE_STRUCT_OPS: return true; default: return false; } } int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no) { const struct btf_param *args; const struct btf_type *t; int off = 0, i; u32 sz; args = btf_params(func_proto); for (i = 0; i < arg_no; i++) { t = btf_type_by_id(btf, args[i].type); t = btf_resolve_size(btf, t, &sz); if (IS_ERR(t)) return PTR_ERR(t); off += roundup(sz, 8); } return off; } struct bpf_raw_tp_null_args { const char *func; u64 mask; }; static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { /* sched */ { "sched_pi_setprio", 0x10 }, /* ... from sched_numa_pair_template event class */ { "sched_stick_numa", 0x100 }, { "sched_swap_numa", 0x100 }, /* afs */ { "afs_make_fs_call", 0x10 }, { "afs_make_fs_calli", 0x10 }, { "afs_make_fs_call1", 0x10 }, { "afs_make_fs_call2", 0x10 }, { "afs_protocol_error", 0x1 }, { "afs_flock_ev", 0x10 }, /* cachefiles */ { "cachefiles_lookup", 0x1 | 0x200 }, { "cachefiles_unlink", 0x1 }, { "cachefiles_rename", 0x1 }, { "cachefiles_prep_read", 0x1 }, { "cachefiles_mark_active", 0x1 }, { "cachefiles_mark_failed", 0x1 }, { "cachefiles_mark_inactive", 0x1 }, { "cachefiles_vfs_error", 0x1 }, { "cachefiles_io_error", 0x1 }, { "cachefiles_ondemand_open", 0x1 }, { "cachefiles_ondemand_copen", 0x1 }, { "cachefiles_ondemand_close", 0x1 }, { "cachefiles_ondemand_read", 0x1 }, { "cachefiles_ondemand_cread", 0x1 }, { "cachefiles_ondemand_fd_write", 0x1 }, { "cachefiles_ondemand_fd_release", 0x1 }, /* ext4, from ext4__mballoc event class */ { "ext4_mballoc_discard", 0x10 }, { "ext4_mballoc_free", 0x10 }, /* fib */ { "fib_table_lookup", 0x100 }, /* filelock */ /* ... from filelock_lock event class */ { "posix_lock_inode", 0x10 }, { "fcntl_setlk", 0x10 }, { "locks_remove_posix", 0x10 }, { "flock_lock_inode", 0x10 }, /* ... from filelock_lease event class */ { "break_lease_noblock", 0x10 }, { "break_lease_block", 0x10 }, { "break_lease_unblock", 0x10 }, { "generic_delete_lease", 0x10 }, { "time_out_leases", 0x10 }, /* host1x */ { "host1x_cdma_push_gather", 0x10000 }, /* huge_memory */ { "mm_khugepaged_scan_pmd", 0x10 }, { "mm_collapse_huge_page_isolate", 0x1 }, { "mm_khugepaged_scan_file", 0x10 }, { "mm_khugepaged_collapse_file", 0x10 }, /* kmem */ { "mm_page_alloc", 0x1 }, { "mm_page_pcpu_drain", 0x1 }, /* .. from mm_page event class */ { "mm_page_alloc_zone_locked", 0x1 }, /* netfs */ { "netfs_failure", 0x10 }, /* power */ { "device_pm_callback_start", 0x10 }, /* qdisc */ { "qdisc_dequeue", 0x1000 }, /* rxrpc */ { "rxrpc_recvdata", 0x1 }, { "rxrpc_resend", 0x10 }, { "rxrpc_tq", 0x10 }, { "rxrpc_client", 0x1 }, /* skb */ {"kfree_skb", 0x1000}, /* sunrpc */ { "xs_stream_read_data", 0x1 }, /* ... from xprt_cong_event event class */ { "xprt_reserve_cong", 0x10 }, { "xprt_release_cong", 0x10 }, { "xprt_get_cong", 0x10 }, { "xprt_put_cong", 0x10 }, /* tcp */ { "tcp_send_reset", 0x11 }, { "tcp_sendmsg_locked", 0x100 }, /* tegra_apb_dma */ { "tegra_dma_tx_status", 0x100 }, /* timer_migration */ { "tmigr_update_events", 0x1 }, /* writeback, from writeback_folio_template event class */ { "writeback_dirty_folio", 0x10 }, { "folio_wait_writeback", 0x10 }, /* rdma */ { "mr_integ_alloc", 0x2000 }, /* bpf_testmod */ { "bpf_testmod_test_read", 0x0 }, /* amdgpu */ { "amdgpu_vm_bo_map", 0x1 }, { "amdgpu_vm_bo_unmap", 0x1 }, /* netfs */ { "netfs_folioq", 0x1 }, /* xfs from xfs_defer_pending_class */ { "xfs_defer_create_intent", 0x1 }, { "xfs_defer_cancel_list", 0x1 }, { "xfs_defer_pending_finish", 0x1 }, { "xfs_defer_pending_abort", 0x1 }, { "xfs_defer_relog_intent", 0x1 }, { "xfs_defer_isolate_paused", 0x1 }, { "xfs_defer_item_pause", 0x1 }, { "xfs_defer_item_unpause", 0x1 }, /* xfs from xfs_defer_pending_item_class */ { "xfs_defer_add_item", 0x1 }, { "xfs_defer_cancel_item", 0x1 }, { "xfs_defer_finish_item", 0x1 }, /* xfs from xfs_icwalk_class */ { "xfs_ioc_free_eofblocks", 0x10 }, { "xfs_blockgc_free_space", 0x10 }, /* xfs from xfs_btree_cur_class */ { "xfs_btree_updkeys", 0x100 }, { "xfs_btree_overlapped_query_range", 0x100 }, /* xfs from xfs_imap_class*/ { "xfs_map_blocks_found", 0x10000 }, { "xfs_map_blocks_alloc", 0x10000 }, { "xfs_iomap_alloc", 0x1000 }, { "xfs_iomap_found", 0x1000 }, /* xfs from xfs_fs_class */ { "xfs_inodegc_flush", 0x1 }, { "xfs_inodegc_push", 0x1 }, { "xfs_inodegc_start", 0x1 }, { "xfs_inodegc_stop", 0x1 }, { "xfs_inodegc_queue", 0x1 }, { "xfs_inodegc_throttle", 0x1 }, { "xfs_fs_sync_fs", 0x1 }, { "xfs_blockgc_start", 0x1 }, { "xfs_blockgc_stop", 0x1 }, { "xfs_blockgc_worker", 0x1 }, { "xfs_blockgc_flush_all", 0x1 }, /* xfs_scrub */ { "xchk_nlinks_live_update", 0x10 }, /* xfs_scrub from xchk_metapath_class */ { "xchk_metapath_lookup", 0x100 }, /* nfsd */ { "nfsd_dirent", 0x1 }, { "nfsd_file_acquire", 0x1001 }, { "nfsd_file_insert_err", 0x1 }, { "nfsd_file_cons_err", 0x1 }, /* nfs4 */ { "nfs4_setup_sequence", 0x1 }, { "pnfs_update_layout", 0x10000 }, { "nfs4_inode_callback_event", 0x200 }, { "nfs4_inode_stateid_callback_event", 0x200 }, /* nfs from pnfs_layout_event */ { "pnfs_mds_fallback_pg_init_read", 0x10000 }, { "pnfs_mds_fallback_pg_init_write", 0x10000 }, { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 }, { "pnfs_mds_fallback_read_done", 0x10000 }, { "pnfs_mds_fallback_write_done", 0x10000 }, { "pnfs_mds_fallback_read_pagelist", 0x10000 }, { "pnfs_mds_fallback_write_pagelist", 0x10000 }, /* coda */ { "coda_dec_pic_run", 0x10 }, { "coda_dec_pic_done", 0x10 }, /* cfg80211 */ { "cfg80211_scan_done", 0x11 }, { "rdev_set_coalesce", 0x10 }, { "cfg80211_report_wowlan_wakeup", 0x100 }, { "cfg80211_inform_bss_frame", 0x100 }, { "cfg80211_michael_mic_failure", 0x10000 }, /* cfg80211 from wiphy_work_event */ { "wiphy_work_queue", 0x10 }, { "wiphy_work_run", 0x10 }, { "wiphy_work_cancel", 0x10 }, { "wiphy_work_flush", 0x10 }, /* hugetlbfs */ { "hugetlbfs_alloc_inode", 0x10 }, /* spufs */ { "spufs_context", 0x10 }, /* kvm_hv */ { "kvm_page_fault_enter", 0x100 }, /* dpu */ { "dpu_crtc_setup_mixer", 0x100 }, /* binder */ { "binder_transaction", 0x100 }, /* bcachefs */ { "btree_path_free", 0x100 }, /* hfi1_tx */ { "hfi1_sdma_progress", 0x1000 }, /* iptfs */ { "iptfs_ingress_postq_event", 0x1000 }, /* neigh */ { "neigh_update", 0x10 }, /* snd_firewire_lib */ { "amdtp_packet", 0x100 }, }; bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const struct btf_type *t = prog->aux->attach_func_proto; struct bpf_prog *tgt_prog = prog->aux->dst_prog; struct btf *btf = bpf_prog_get_target_btf(prog); const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; bool ptr_err_raw_tp = false; const char *tag_value; u32 nr_args, arg; int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", tname, off); return false; } arg = btf_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. */ nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; nr_args--; } if (arg > nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; } if (arg == nr_args) { switch (prog->expected_attach_type) { case BPF_LSM_MAC: /* mark we are accessing the return value */ info->is_retval = true; fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. * * While the LSM programs are BPF_MODIFY_RETURN-like * the check: * * if (ret_type != 'int') * return -EINVAL; * * is _not_ done here. This is still safe as LSM hooks * have only void and int return types. */ if (!t) return true; t = btf_type_by_id(btf, t->type); break; case BPF_MODIFY_RETURN: /* For now the BPF_MODIFY_RETURN can only be attached to * functions that return an int. */ if (!t) return false; t = btf_type_skip_modifiers(btf, t->type, NULL); if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_type_str(t)); return false; } break; default: bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; } } else { if (!t) /* Default prog with MAX_BPF_FUNC_REG_ARGS args */ return true; t = btf_type_by_id(btf, args[arg].type); } /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { bpf_log(log, "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, __btf_name_by_offset(btf, t->name_off), btf_type_str(t)); return false; } if (size != sizeof(u64)) { bpf_log(log, "func '%s' size %d must be 8\n", tname, size); return false; } /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; u32 type, flag; type = base_type(ctx_arg_info->reg_type); flag = type_flag(ctx_arg_info->reg_type); if (ctx_arg_info->offset == off && type == PTR_TO_BUF && (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } } /* * If it's a pointer to void, it's the same as scalar from the verifier * safety POV. Either way, no futher pointer walking is allowed. */ if (is_void_or_int_ptr(btf, t)) return true; /* this is a pointer to another type */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; if (ctx_arg_info->offset == off) { if (!ctx_arg_info->btf_id) { bpf_log(log,"invalid btf_id for context argument offset %u\n", off); return false; } info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; info->ref_obj_id = ctx_arg_info->ref_obj_id; return true; } } info->reg_type = PTR_TO_BTF_ID; if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { struct btf *btf = prog->aux->attach_btf; const struct btf_type *t; const char *tname; /* BTF lookups cannot fail, return false on error */ t = btf_type_by_id(btf, prog->aux->attach_btf_id); if (!t) return false; tname = btf_name_by_offset(btf, t->name_off); if (!tname) return false; /* Checked by bpf_check_attach_target */ tname += sizeof("btf_trace_") - 1; for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) { /* Is this a func with potential NULL args? */ if (strcmp(tname, raw_tp_null_args[i].func)) continue; if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4))) info->reg_type |= PTR_MAYBE_NULL; /* Is the current arg IS_ERR? */ if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4))) ptr_err_raw_tp = true; break; } /* If we don't know NULL-ness specification and the tracepoint * is coming from a loadable module, be conservative and mark * argument as PTR_MAYBE_NULL. */ if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf)) info->reg_type |= PTR_MAYBE_NULL; } if (tgt_prog) { enum bpf_prog_type tgt_type; if (tgt_prog->type == BPF_PROG_TYPE_EXT) tgt_type = tgt_prog->aux->saved_dst_prog_type; else tgt_type = tgt_prog->type; ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { info->btf = btf_vmlinux; info->btf_id = ret; return true; } else { return false; } } info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag_value, "user") == 0) info->reg_type |= MEM_USER; if (strcmp(tag_value, "percpu") == 0) info->reg_type |= MEM_PERCPU; } /* skip modifiers */ while (btf_type_is_modifier(t)) { info->btf_id = t->type; t = btf_type_by_id(btf, t->type); } if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", tname, arg, btf_type_str(t)); return false; } bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); /* Perform all checks on the validity of type for this argument, but if * we know it can be IS_ERR at runtime, scrub pointer type and mark as * scalar. */ if (ptr_err_raw_tp) { bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg); info->reg_type = SCALAR_VALUE; } return true; } EXPORT_SYMBOL_GPL(btf_ctx_access); enum bpf_struct_walk_result { /* < 0 error */ WALK_SCALAR = 0, WALK_PTR, WALK_PTR_UNTRUSTED, WALK_STRUCT, }; static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; again: if (btf_type_is_modifier(t)) t = btf_type_skip_modifiers(btf, t->type, NULL); tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; } vlen = btf_type_vlen(t); if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED)) /* * walking unions yields untrusted pointers * with exception of __bpf_md_ptr and other * unions with a single member */ *flag |= PTR_UNTRUSTED; if (off + size > t->size) { /* If the last element is a variable size array, we may * need to relax the rule. */ struct btf_array *array_elem; if (vlen == 0) goto error; member = btf_type_member(t) + vlen - 1; mtype = btf_type_skip_modifiers(btf, member->type, NULL); if (!btf_type_is_array(mtype)) goto error; array_elem = (struct btf_array *)(mtype + 1); if (array_elem->nelems != 0) goto error; moff = __btf_member_bit_offset(t, member) / 8; if (off < moff) goto error; /* allow structure and integer */ t = btf_type_skip_modifiers(btf, array_elem->type, NULL); if (btf_type_is_int(t)) return WALK_SCALAR; if (!btf_type_is_struct(t)) goto error; off = (off - moff) % t->size; goto again; error: bpf_log(log, "access beyond struct %s at off %u size %u\n", tname, off, size); return -EACCES; } for_each_member(i, t, member) { /* offset of the field in bytes */ moff = __btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; if (__btf_member_bitfield_size(t, member)) { u32 end_bit = __btf_member_bit_offset(t, member) + __btf_member_bitfield_size(t, member); /* off <= moff instead of off == moff because clang * does not generate a BTF member for anonymous * bitfield like the ":16" here: * struct { * int :16; * int x:8; * }; */ if (off <= moff && BITS_ROUNDUP_BYTES(end_bit) <= off + size) return WALK_SCALAR; /* off may be accessing a following member * * or * * Doing partial access at either end of this * bitfield. Continue on this case also to * treat it as not accessing this bitfield * and eventually error out as field not * found to keep it simple. * It could be relaxed if there was a legit * partial access case later. */ continue; } /* In case of "off" is pointing to holes of a struct */ if (off < moff) break; /* type of the field */ mid = member->type; mtype = btf_type_by_id(btf, member->type); mname = __btf_name_by_offset(btf, member->name_off); mtype = __btf_resolve_size(btf, mtype, &msize, &elem_type, &elem_id, &total_nelems, &mid); if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; } mtrue_end = moff + msize; if (off >= mtrue_end) /* no overlap with member, keep iterating */ continue; if (btf_type_is_array(mtype)) { u32 elem_idx; /* __btf_resolve_size() above helps to * linearize a multi-dimensional array. * * The logic here is treating an array * in a struct as the following way: * * struct outer { * struct inner array[2][2]; * }; * * looks like: * * struct outer { * struct inner array_elem0; * struct inner array_elem1; * struct inner array_elem2; * struct inner array_elem3; * }; * * When accessing outer->array[1][0], it moves * moff to "array_elem2", set mtype to * "struct inner", and msize also becomes * sizeof(struct inner). Then most of the * remaining logic will fall through without * caring the current member is an array or * not. * * Unlike mtype/msize/moff, mtrue_end does not * change. The naming difference ("_true") tells * that it is not always corresponding to * the current mtype/msize/moff. * It is the true end of the current * member (i.e. array in this case). That * will allow an int array to be accessed like * a scratch space, * i.e. allow access beyond the size of * the array's element as long as it is * within the mtrue_end boundary. */ /* skip empty array */ if (moff == mtrue_end) continue; msize /= total_nelems; elem_idx = (off - moff) / msize; moff += elem_idx * msize; mtype = elem_type; mid = elem_id; } /* the 'off' we're looking for is either equal to start * of this field or inside of this struct */ if (btf_type_is_struct(mtype)) { /* our field must be inside that union or struct */ t = mtype; /* return if the offset matches the member offset */ if (off == moff) { *next_btf_id = mid; return WALK_STRUCT; } /* adjust offset we're looking for */ off -= moff; goto again; } if (btf_type_is_ptr(mtype)) { const struct btf_type *stype, *t; enum bpf_type_flag tmp_flag = 0; u32 id; if (msize != size || off != moff) { bpf_log(log, "cannot access ptr member %s with moff %u in struct %s with off %u size %u\n", mname, moff, tname, off, size); return -EACCES; } /* check type tag */ t = btf_type_by_id(btf, mtype->type); if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); /* check __user tag */ if (strcmp(tag_value, "user") == 0) tmp_flag = MEM_USER; /* check __percpu tag */ if (strcmp(tag_value, "percpu") == 0) tmp_flag = MEM_PERCPU; /* check __rcu tag */ if (strcmp(tag_value, "rcu") == 0) tmp_flag = MEM_RCU; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; *flag |= tmp_flag; if (field_name) *field_name = mname; return WALK_PTR; } return WALK_PTR_UNTRUSTED; } /* Allow more flexible access within an int as long as * it is within mtrue_end. * Since mtrue_end could be the end of an array, * that also allows using an array of int as a scratch * space. e.g. skb->cb[]. */ if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) { bpf_log(log, "access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n", mname, mtrue_end, tname, off, size); return -EACCES; } return WALK_SCALAR; } bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); return -EINVAL; } int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { const struct btf *btf = reg->btf; enum bpf_type_flag tmp_flag = 0; const struct btf_type *t; u32 id = reg->btf_id; int err; while (type_is_alloc(reg->type)) { struct btf_struct_meta *meta; struct btf_record *rec; int i; meta = btf_find_struct_meta(btf, id); if (!meta) break; rec = meta->record; for (i = 0; i < rec->cnt; i++) { struct btf_field *field = &rec->fields[i]; u32 offset = field->offset; if (off < offset + field->size && offset < off + size) { bpf_log(log, "direct access to %s is disallowed\n", btf_field_type_name(field->type)); return -EACCES; } } break; } t = btf_type_by_id(btf, id); do { err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name); switch (err) { case WALK_PTR: /* For local types, the destination register cannot * become a pointer again. */ if (type_is_alloc(reg->type)) return SCALAR_VALUE; /* If we found the pointer or scalar on t+off, * we're done. */ *next_btf_id = id; *flag = tmp_flag; return PTR_TO_BTF_ID; case WALK_PTR_UNTRUSTED: *flag = MEM_RDONLY | PTR_UNTRUSTED; return PTR_TO_MEM; case WALK_SCALAR: return SCALAR_VALUE; case WALK_STRUCT: /* We found nested struct, so continue the search * by diving in it. At this point the offset is * aligned with the new type, so set it to 0. */ t = btf_type_by_id(btf, id); off = 0; break; default: /* It's either error or unknown return value.. * scream and leave. */ if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value")) return -EINVAL; return err; } } while (t); return -EINVAL; } /* Check that two BTF types, each specified as an BTF object + id, are exactly * the same. Trivial ID check is not enough due to module BTFs, because we can * end up with two different module BTFs, but IDs point to the common type in * vmlinux BTF. */ bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2) { if (id1 != id2) return false; if (btf1 == btf2) return true; return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2); } bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, bool strict) { const struct btf_type *type; enum bpf_type_flag flag = 0; int err; /* Are we already done? */ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; /* In case of strict type match, we do not walk struct, the top level * type match must succeed. When strict is true, off should have already * been 0. */ if (strict) return false; again: type = btf_type_by_id(btf, id); if (!type) return false; err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL); if (err != WALK_STRUCT) return false; /* We found nested struct object. If it matches * the requested ID, we're done. Otherwise let's * continue the search with offset 0 in the new * type. */ if (!btf_types_are_same(btf, id, need_btf, need_type_id)) { off = 0; goto again; } return true; } static int __get_type_size(struct btf *btf, u32 btf_id, const struct btf_type **ret_type) { const struct btf_type *t; *ret_type = btf_type_by_id(btf, 0); if (!btf_id) /* void */ return 0; t = btf_type_by_id(btf, btf_id); while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!t) return -EINVAL; *ret_type = t; if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) return t->size; return -EINVAL; } static u8 __get_type_fmodel_flags(const struct btf_type *t) { u8 flags = 0; if (__btf_type_is_struct(t)) flags |= BTF_FMODEL_STRUCT_ARG; if (btf_type_is_signed_int(t)) flags |= BTF_FMODEL_SIGNED_ARG; return flags; } int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *func, const char *tname, struct btf_func_model *m) { const struct btf_param *args; const struct btf_type *t; u32 i, nargs; int ret; if (!func) { /* BTF function prototype doesn't match the verifier types. * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { m->arg_size[i] = 8; m->arg_flags[i] = 0; } m->ret_size = 8; m->ret_flags = 0; m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); if (nargs > MAX_BPF_FUNC_ARGS) { bpf_log(log, "The function %s has %d arguments. Too many.\n", tname, nargs); return -EINVAL; } ret = __get_type_size(btf, func->type, &t); if (ret < 0 || __btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", tname, btf_type_str(t)); return -EINVAL; } m->ret_size = ret; m->ret_flags = __get_type_fmodel_flags(t); for (i = 0; i < nargs; i++) { if (i == nargs - 1 && args[i].type == 0) { bpf_log(log, "The function %s with variable args is unsupported.\n", tname); return -EINVAL; } ret = __get_type_size(btf, args[i].type, &t); /* No support of struct argument size greater than 16 bytes */ if (ret < 0 || ret > 16) { bpf_log(log, "The function %s arg%d type %s is unsupported.\n", tname, i, btf_type_str(t)); return -EINVAL; } if (ret == 0) { bpf_log(log, "The function %s has malformed void argument.\n", tname); return -EINVAL; } m->arg_size[i] = ret; m->arg_flags[i] = __get_type_fmodel_flags(t); } m->nr_args = nargs; return 0; } /* Compare BTFs of two functions assuming only scalars and pointers to context. * t1 points to BTF_KIND_FUNC in btf1 * t2 points to BTF_KIND_FUNC in btf2 * Returns: * EINVAL - function prototype mismatch * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. */ static int btf_check_func_type_match(struct bpf_verifier_log *log, struct btf *btf1, const struct btf_type *t1, struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; u32 nargs1, nargs2, i; fn1 = btf_name_by_offset(btf1, t1->name_off); fn2 = btf_name_by_offset(btf2, t2->name_off); if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) { bpf_log(log, "%s() is not a global function\n", fn1); return -EINVAL; } if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) { bpf_log(log, "%s() is not a global function\n", fn2); return -EINVAL; } t1 = btf_type_by_id(btf1, t1->type); if (!t1 || !btf_type_is_func_proto(t1)) return -EFAULT; t2 = btf_type_by_id(btf2, t2->type); if (!t2 || !btf_type_is_func_proto(t2)) return -EFAULT; args1 = (const struct btf_param *)(t1 + 1); nargs1 = btf_type_vlen(t1); args2 = (const struct btf_param *)(t2 + 1); nargs2 = btf_type_vlen(t2); if (nargs1 != nargs2) { bpf_log(log, "%s() has %d args while %s() has %d args\n", fn1, nargs1, fn2, nargs2); return -EINVAL; } t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); if (t1->info != t2->info) { bpf_log(log, "Return type %s of %s() doesn't match type %s of %s()\n", btf_type_str(t1), fn1, btf_type_str(t2), fn2); return -EINVAL; } for (i = 0; i < nargs1; i++) { t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL); t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL); if (t1->info != t2->info) { bpf_log(log, "arg%d in %s() is %s while %s() has %s\n", i, fn1, btf_type_str(t1), fn2, btf_type_str(t2)); return -EINVAL; } if (btf_type_has_size(t1) && t1->size != t2->size) { bpf_log(log, "arg%d in %s() has size %d while %s() has %d\n", i, fn1, t1->size, fn2, t2->size); return -EINVAL; } /* global functions are validated with scalars and pointers * to context only. And only global functions can be replaced. * Hence type check only those types. */ if (btf_type_is_int(t1) || btf_is_any_enum(t1)) continue; if (!btf_type_is_ptr(t1)) { bpf_log(log, "arg%d in %s() has unrecognized type\n", i, fn1); return -EINVAL; } t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); if (!btf_type_is_struct(t1)) { bpf_log(log, "arg%d in %s() is not a pointer to context\n", i, fn1); return -EINVAL; } if (!btf_type_is_struct(t2)) { bpf_log(log, "arg%d in %s() is not a pointer to context\n", i, fn2); return -EINVAL; } /* This is an optional check to make program writing easier. * Compare names of structs and report an error to the user. * btf_prepare_func_args() already checked that t2 struct * is a context type. btf_prepare_func_args() will check * later that t1 struct is a context type as well. */ s1 = btf_name_by_offset(btf1, t1->name_off); s2 = btf_name_by_offset(btf2, t2->name_off); if (strcmp(s1, s2)) { bpf_log(log, "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n", i, fn1, s1, fn2, s2); return -EINVAL; } } return 0; } /* Compare BTFs of given program with BTF of target program */ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf2, const struct btf_type *t2) { struct btf *btf1 = prog->aux->btf; const struct btf_type *t1; u32 btf_id = 0; if (!prog->aux->func_info) { bpf_log(log, "Program extension requires BTF\n"); return -EINVAL; } btf_id = prog->aux->func_info[0].type_id; if (!btf_id) return -EFAULT; t1 = btf_type_by_id(btf1, btf_id); if (!t1 || !btf_type_is_func(t1)) return -EFAULT; return btf_check_func_type_match(log, btf1, t1, btf2, t2); } static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t) { const char *name; t = btf_type_by_id(btf, t->type); /* skip PTR */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); /* allow either struct or struct forward declaration */ if (btf_type_is_struct(t) || (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) { name = btf_str_by_offset(btf, t->name_off); return name && strcmp(name, "bpf_dynptr") == 0; } return false; } struct bpf_cand_cache { const char *name; u32 name_len; u16 kind; u16 cnt; struct { const struct btf *btf; u32 id; } cands[]; }; static DEFINE_MUTEX(cand_cache_mutex); static struct bpf_cand_cache * bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id); static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx, const struct btf *btf, const struct btf_type *t) { struct bpf_cand_cache *cc; struct bpf_core_ctx ctx = { .btf = btf, .log = log, }; u32 kern_type_id, type_id; int err = 0; /* skip PTR and modifiers */ type_id = t->type; t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) { type_id = t->type; t = btf_type_by_id(btf, t->type); } mutex_lock(&cand_cache_mutex); cc = bpf_core_find_cands(&ctx, type_id); if (IS_ERR(cc)) { err = PTR_ERR(cc); bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n", arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), err); goto cand_cache_unlock; } if (cc->cnt != 1) { bpf_log(log, "arg#%d reference type('%s %s') %s\n", arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), cc->cnt == 0 ? "has no matches" : "is ambiguous"); err = cc->cnt == 0 ? -ENOENT : -ESRCH; goto cand_cache_unlock; } if (btf_is_module(cc->cands[0].btf)) { bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n", arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); err = -EOPNOTSUPP; goto cand_cache_unlock; } kern_type_id = cc->cands[0].id; cand_cache_unlock: mutex_unlock(&cand_cache_mutex); if (err) return err; return kern_type_id; } enum btf_arg_tag { ARG_TAG_CTX = BIT_ULL(0), ARG_TAG_NONNULL = BIT_ULL(1), ARG_TAG_TRUSTED = BIT_ULL(2), ARG_TAG_UNTRUSTED = BIT_ULL(3), ARG_TAG_NULLABLE = BIT_ULL(4), ARG_TAG_ARENA = BIT_ULL(5), }; /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - cannot convert BTF. * 0 - Successfully processed BTF and constructed argument expectations. */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) { bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL; struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t, *ref_t, *fn_t; u32 i, nargs, btf_id; const char *tname; if (sub->args_cached) return 0; if (!prog->aux->func_info) { verifier_bug(env, "func_info undefined"); return -EFAULT; } btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) { if (!is_global) /* not fatal for static funcs */ return -EINVAL; bpf_log(log, "Global functions need valid BTF\n"); return -EFAULT; } fn_t = btf_type_by_id(btf, btf_id); if (!fn_t || !btf_type_is_func(fn_t)) { /* These checks were already done by the verifier while loading * struct bpf_func_info */ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", subprog); return -EFAULT; } tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { verifier_bug(env, "unreliable BTF for function %s()", tname); return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) prog_type = prog->aux->dst_prog->type; t = btf_type_by_id(btf, fn_t->type); if (!t || !btf_type_is_func_proto(t)) { bpf_log(log, "Invalid type of function %s()\n", tname); return -EFAULT; } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); if (nargs > MAX_BPF_FUNC_REG_ARGS) { if (!is_global) return -EINVAL; bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } /* check that function returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { if (!is_global) return -EINVAL; bpf_log(log, "Global function %s() doesn't return scalar. Only those are supported.\n", tname); return -EINVAL; } /* Convert BTF function arguments into verifier types. * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { u32 tags = 0; int id = 0; /* 'arg:<tag>' decl_tag takes precedence over derivation of * register type from BTF type itself */ while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) { const struct btf_type *tag_t = btf_type_by_id(btf, id); const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; /* disallow arg tags in static subprogs */ if (!is_global) { bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); return -EOPNOTSUPP; } if (strcmp(tag, "ctx") == 0) { tags |= ARG_TAG_CTX; } else if (strcmp(tag, "trusted") == 0) { tags |= ARG_TAG_TRUSTED; } else if (strcmp(tag, "untrusted") == 0) { tags |= ARG_TAG_UNTRUSTED; } else if (strcmp(tag, "nonnull") == 0) { tags |= ARG_TAG_NONNULL; } else if (strcmp(tag, "nullable") == 0) { tags |= ARG_TAG_NULLABLE; } else if (strcmp(tag, "arena") == 0) { tags |= ARG_TAG_ARENA; } else { bpf_log(log, "arg#%d has unsupported set of tags\n", i); return -EOPNOTSUPP; } } if (id != -ENOENT) { bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id); return id; } t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_ptr(t)) goto skip_pointer; if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) { if (tags & ~ARG_TAG_CTX) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } if ((tags & ARG_TAG_CTX) && btf_validate_prog_ctx_type(log, btf, t, i, prog_type, prog->expected_attach_type)) return -EINVAL; sub->args[i].arg_type = ARG_PTR_TO_CTX; continue; } if (btf_is_dynptr_ptr(btf, t)) { if (tags) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; continue; } if (tags & ARG_TAG_TRUSTED) { int kern_type_id; if (tags & ARG_TAG_NONNULL) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); if (kern_type_id < 0) return kern_type_id; sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED; if (tags & ARG_TAG_NULLABLE) sub->args[i].arg_type |= PTR_MAYBE_NULL; sub->args[i].btf_id = kern_type_id; continue; } if (tags & ARG_TAG_UNTRUSTED) { struct btf *vmlinux_btf; int kern_type_id; if (tags & ~ARG_TAG_UNTRUSTED) { bpf_log(log, "arg#%d untrusted cannot be combined with any other tags\n", i); return -EINVAL; } ref_t = btf_type_skip_modifiers(btf, t->type, NULL); if (btf_type_is_void(ref_t) || btf_type_is_primitive(ref_t)) { sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; sub->args[i].mem_size = 0; continue; } kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); if (kern_type_id < 0) return kern_type_id; vmlinux_btf = bpf_get_btf_vmlinux(); ref_t = btf_type_by_id(vmlinux_btf, kern_type_id); if (!btf_type_is_struct(ref_t)) { tname = __btf_name_by_offset(vmlinux_btf, t->name_off); bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n", i, btf_type_str(ref_t), tname); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_UNTRUSTED; sub->args[i].btf_id = kern_type_id; continue; } if (tags & ARG_TAG_ARENA) { if (tags & ~ARG_TAG_ARENA) { bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_ARENA; continue; } if (is_global) { /* generic user data pointer */ u32 mem_size; if (tags & ARG_TAG_NULLABLE) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } t = btf_type_skip_modifiers(btf, t->type, NULL); ref_t = btf_resolve_size(btf, t, &mem_size); if (IS_ERR(ref_t)) { bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), PTR_ERR(ref_t)); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL; if (tags & ARG_TAG_NONNULL) sub->args[i].arg_type &= ~PTR_MAYBE_NULL; sub->args[i].mem_size = mem_size; continue; } skip_pointer: if (tags) { bpf_log(log, "arg#%d has pointer tag, but is not a pointer type\n", i); return -EINVAL; } if (btf_type_is_int(t) || btf_is_any_enum(t)) { sub->args[i].arg_type = ARG_ANYTHING; continue; } if (!is_global) return -EINVAL; bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", i, btf_type_str(t), tname); return -EINVAL; } sub->arg_cnt = nargs; sub->args_cached = true; return 0; } static void btf_type_show(const struct btf *btf, u32 type_id, void *obj, struct btf_show *show) { const struct btf_type *t = btf_type_by_id(btf, type_id); show->btf = btf; memset(&show->state, 0, sizeof(show->state)); memset(&show->obj, 0, sizeof(show->obj)); btf_type_ops(t)->show(btf, t, type_id, obj, 0, show); } __printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt, va_list args) { seq_vprintf((struct seq_file *)show->target, fmt, args); } int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m, u64 flags) { struct btf_show sseq; sseq.target = m; sseq.showfn = btf_seq_show; sseq.flags = flags; btf_type_show(btf, type_id, obj, &sseq); return sseq.state.status; } void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { (void) btf_type_seq_show_flags(btf, type_id, obj, m, BTF_SHOW_NONAME | BTF_SHOW_COMPACT | BTF_SHOW_ZERO | BTF_SHOW_UNSAFE); } struct btf_show_snprintf { struct btf_show show; int len_left; /* space left in string */ int len; /* length we would have written */ }; __printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt, va_list args) { struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show; int len; len = vsnprintf(show->target, ssnprintf->len_left, fmt, args); if (len < 0) { ssnprintf->len_left = 0; ssnprintf->len = len; } else if (len >= ssnprintf->len_left) { /* no space, drive on to get length we would have written */ ssnprintf->len_left = 0; ssnprintf->len += len; } else { ssnprintf->len_left -= len; ssnprintf->len += len; show->target += len; } } int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, char *buf, int len, u64 flags) { struct btf_show_snprintf ssnprintf; ssnprintf.show.target = buf; ssnprintf.show.flags = flags; ssnprintf.show.showfn = btf_snprintf_show; ssnprintf.len_left = len; ssnprintf.len = 0; btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); /* If we encountered an error, return it. */ if (ssnprintf.show.state.status) return ssnprintf.show.state.status; /* Otherwise return length we would have written */ return ssnprintf.len; } #ifdef CONFIG_PROC_FS static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) { const struct btf *btf = filp->private_data; seq_printf(m, "btf_id:\t%u\n", btf->id); } #endif static int btf_release(struct inode *inode, struct file *filp) { btf_put(filp->private_data); return 0; } const struct file_operations btf_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_btf_show_fdinfo, #endif .release = btf_release, }; static int __btf_new_fd(struct btf *btf) { return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { struct btf *btf; int ret; btf = btf_parse(attr, uattr, uattr_size); if (IS_ERR(btf)) return PTR_ERR(btf); ret = btf_alloc_id(btf); if (ret) { btf_free(btf); return ret; } /* * The BTF ID is published to the userspace. * All BTF free must go through call_rcu() from * now on (i.e. free by calling btf_put()). */ ret = __btf_new_fd(btf); if (ret < 0) btf_put(btf); return ret; } struct btf *btf_get_by_fd(int fd) { struct btf *btf; CLASS(fd, f)(fd); btf = __btf_get_by_fd(f); if (!IS_ERR(btf)) refcount_inc(&btf->refcnt); return btf; } int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; char __user *uname; u32 uinfo_len, uname_len, name_len; int ret = 0; uinfo = u64_to_user_ptr(attr->info.info); uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; info.id = btf->id; ubtf = u64_to_user_ptr(info.btf); btf_copy = min_t(u32, btf->data_size, info.btf_size); if (copy_to_user(ubtf, btf->data, btf_copy)) return -EFAULT; info.btf_size = btf->data_size; info.kernel_btf = btf->kernel_btf; uname = u64_to_user_ptr(info.name); uname_len = info.name_len; if (!uname ^ !uname_len) return -EINVAL; name_len = strlen(btf->name); info.name_len = name_len; if (uname) { if (uname_len >= name_len + 1) { if (copy_to_user(uname, btf->name, name_len + 1)) return -EFAULT; } else { char zero = '\0'; if (copy_to_user(uname, btf->name, uname_len - 1)) return -EFAULT; if (put_user(zero, uname + uname_len - 1)) return -EFAULT; /* let user-space know about too short buffer */ ret = -ENOSPC; } } if (copy_to_user(uinfo, &info, info_copy) || put_user(info_copy, &uattr->info.info_len)) return -EFAULT; return ret; } int btf_get_fd_by_id(u32 id) { struct btf *btf; int fd; rcu_read_lock(); btf = idr_find(&btf_idr, id); if (!btf || !refcount_inc_not_zero(&btf->refcnt)) btf = ERR_PTR(-ENOENT); rcu_read_unlock(); if (IS_ERR(btf)) return PTR_ERR(btf); fd = __btf_new_fd(btf); if (fd < 0) btf_put(btf); return fd; } u32 btf_obj_id(const struct btf *btf) { return btf->id; } bool btf_is_kernel(const struct btf *btf) { return btf->kernel_btf; } bool btf_is_module(const struct btf *btf) { return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; } enum { BTF_MODULE_F_LIVE = (1 << 0), }; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module { struct list_head list; struct module *module; struct btf *btf; struct bin_attribute *sysfs_attr; int flags; }; static LIST_HEAD(btf_modules); static DEFINE_MUTEX(btf_module_mutex); static void purge_cand_cache(struct btf *btf); static int btf_module_notify(struct notifier_block *nb, unsigned long op, void *module) { struct btf_module *btf_mod, *tmp; struct module *mod = module; struct btf *btf; int err = 0; if (mod->btf_data_size == 0 || (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE && op != MODULE_STATE_GOING)) goto out; switch (op) { case MODULE_STATE_COMING: btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL); if (!btf_mod) { err = -ENOMEM; goto out; } btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size, mod->btf_base_data, mod->btf_base_data_size); if (IS_ERR(btf)) { kfree(btf_mod); if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) { pr_warn("failed to validate module [%s] BTF: %ld\n", mod->name, PTR_ERR(btf)); err = PTR_ERR(btf); } else { pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n"); } goto out; } err = btf_alloc_id(btf); if (err) { btf_free(btf); kfree(btf_mod); goto out; } purge_cand_cache(NULL); mutex_lock(&btf_module_mutex); btf_mod->module = module; btf_mod->btf = btf; list_add(&btf_mod->list, &btf_modules); mutex_unlock(&btf_module_mutex); if (IS_ENABLED(CONFIG_SYSFS)) { struct bin_attribute *attr; attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) goto out; sysfs_bin_attr_init(attr); attr->attr.name = btf->name; attr->attr.mode = 0444; attr->size = btf->data_size; attr->private = btf->data; attr->read = sysfs_bin_attr_simple_read; err = sysfs_create_bin_file(btf_kobj, attr); if (err) { pr_warn("failed to register module [%s] BTF in sysfs: %d\n", mod->name, err); kfree(attr); err = 0; goto out; } btf_mod->sysfs_attr = attr; } break; case MODULE_STATE_LIVE: mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->module != module) continue; btf_mod->flags |= BTF_MODULE_F_LIVE; break; } mutex_unlock(&btf_module_mutex); break; case MODULE_STATE_GOING: mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->module != module) continue; list_del(&btf_mod->list); if (btf_mod->sysfs_attr) sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); purge_cand_cache(btf_mod->btf); btf_put(btf_mod->btf); kfree(btf_mod->sysfs_attr); kfree(btf_mod); break; } mutex_unlock(&btf_module_mutex); break; } out: return notifier_from_errno(err); } static struct notifier_block btf_module_nb = { .notifier_call = btf_module_notify, }; static int __init btf_module_init(void) { register_module_notifier(&btf_module_nb); return 0; } fs_initcall(btf_module_init); #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ struct module *btf_try_get_module(const struct btf *btf) { struct module *res = NULL; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module *btf_mod, *tmp; mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->btf != btf) continue; /* We must only consider module whose __init routine has * finished, hence we must check for BTF_MODULE_F_LIVE flag, * which is set from the notifier callback for * MODULE_STATE_LIVE. */ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module)) res = btf_mod->module; break; } mutex_unlock(&btf_module_mutex); #endif return res; } /* Returns struct btf corresponding to the struct module. * This function can return NULL or ERR_PTR. */ static struct btf *btf_get_module_btf(const struct module *module) { #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module *btf_mod, *tmp; #endif struct btf *btf = NULL; if (!module) { btf = bpf_get_btf_vmlinux(); if (!IS_ERR_OR_NULL(btf)) btf_get(btf); return btf; } #ifdef CONFIG_DEBUG_INFO_BTF_MODULES mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->module != module) continue; btf_get(btf_mod->btf); btf = btf_mod->btf; break; } mutex_unlock(&btf_module_mutex); #endif return btf; } static int check_btf_kconfigs(const struct module *module, const char *feature) { if (!module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { pr_err("missing vmlinux BTF, cannot register %s\n", feature); return -ENOENT; } if (module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) pr_warn("missing module BTF, cannot register %s\n", feature); return 0; } BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) { struct btf *btf = NULL; int btf_obj_fd = 0; long ret; if (flags) return -EINVAL; if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; ret = bpf_find_btf_id(name, kind, &btf); if (ret > 0 && btf_is_module(btf)) { btf_obj_fd = __btf_new_fd(btf); if (btf_obj_fd < 0) { btf_put(btf); return btf_obj_fd; } return ret | (((u64)btf_obj_fd) << 32); } if (ret > 0) btf_put(btf); return ret; } const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .func = bpf_btf_find_by_name_kind, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) #define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE /* Validate well-formedness of iter argument type. * On success, return positive BTF ID of iter state's STRUCT type. * On error, negative error is returned. */ int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) { const struct btf_param *arg; const struct btf_type *t; const char *name; int btf_id; if (btf_type_vlen(func) <= arg_idx) return -EINVAL; arg = &btf_params(func)[arg_idx]; t = btf_type_skip_modifiers(btf, arg->type, NULL); if (!t || !btf_type_is_ptr(t)) return -EINVAL; t = btf_type_skip_modifiers(btf, t->type, &btf_id); if (!t || !__btf_type_is_struct(t)) return -EINVAL; name = btf_name_by_offset(btf, t->name_off); if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) return -EINVAL; return btf_id; } static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, const struct btf_type *func, u32 func_flags) { u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); const char *sfx, *iter_name; const struct btf_type *t; char exp_name[128]; u32 nr_args; int btf_id; /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */ if (!flags || (flags & (flags - 1))) return -EINVAL; /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */ nr_args = btf_type_vlen(func); if (nr_args < 1) return -EINVAL; btf_id = btf_check_iter_arg(btf, func, 0); if (btf_id < 0) return btf_id; /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to * fit nicely in stack slots */ t = btf_type_by_id(btf, btf_id); if (t->size == 0 || (t->size % 8)) return -EINVAL; /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *) * naming pattern */ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1; if (flags & KF_ITER_NEW) sfx = "new"; else if (flags & KF_ITER_NEXT) sfx = "next"; else /* (flags & KF_ITER_DESTROY) */ sfx = "destroy"; snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx); if (strcmp(func_name, exp_name)) return -EINVAL; /* only iter constructor should have extra arguments */ if (!(flags & KF_ITER_NEW) && nr_args != 1) return -EINVAL; if (flags & KF_ITER_NEXT) { /* bpf_iter_<type>_next() should return pointer */ t = btf_type_skip_modifiers(btf, func->type, NULL); if (!t || !btf_type_is_ptr(t)) return -EINVAL; } if (flags & KF_ITER_DESTROY) { /* bpf_iter_<type>_destroy() should return void */ t = btf_type_by_id(btf, func->type); if (!t || !btf_type_is_void(t)) return -EINVAL; } return 0; } static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) { const struct btf_type *func; const char *func_name; int err; /* any kfunc should be FUNC -> FUNC_PROTO */ func = btf_type_by_id(btf, func_id); if (!func || !btf_type_is_func(func)) return -EINVAL; /* sanity check kfunc name */ func_name = btf_name_by_offset(btf, func->name_off); if (!func_name || !func_name[0]) return -EINVAL; func = btf_type_by_id(btf, func->type); if (!func || !btf_type_is_func_proto(func)) return -EINVAL; if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) { err = btf_check_iter_kfuncs(btf, func_name, func, func_flags); if (err) return err; } return 0; } /* Kernel Function (kfunc) BTF ID set registration API */ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, const struct btf_kfunc_id_set *kset) { struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *add_set = kset->set; bool vmlinux_set = !btf_is_module(btf); bool add_filter = !!kset->filter; struct btf_kfunc_set_tab *tab; struct btf_id_set8 *set; u32 set_cnt, i; int ret; if (hook >= BTF_KFUNC_HOOK_MAX) { ret = -EINVAL; goto end; } if (!add_set->cnt) return 0; tab = btf->kfunc_set_tab; if (tab && add_filter) { u32 i; hook_filter = &tab->hook_filters[hook]; for (i = 0; i < hook_filter->nr_filters; i++) { if (hook_filter->filters[i] == kset->filter) { add_filter = false; break; } } if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) { ret = -E2BIG; goto end; } } if (!tab) { tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN); if (!tab) return -ENOMEM; btf->kfunc_set_tab = tab; } set = tab->sets[hook]; /* Warn when register_btf_kfunc_id_set is called twice for the same hook * for module sets. */ if (WARN_ON_ONCE(set && !vmlinux_set)) { ret = -EINVAL; goto end; } /* In case of vmlinux sets, there may be more than one set being * registered per hook. To create a unified set, we allocate a new set * and concatenate all individual sets being registered. While each set * is individually sorted, they may become unsorted when concatenated, * hence re-sorting the final set again is required to make binary * searching the set using btf_id_set8_contains function work. * * For module sets, we need to allocate as we may need to relocate * BTF ids. */ set_cnt = set ? set->cnt : 0; if (set_cnt > U32_MAX - add_set->cnt) { ret = -EOVERFLOW; goto end; } if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) { ret = -E2BIG; goto end; } /* Grow set */ set = krealloc(tab->sets[hook], struct_size(set, pairs, set_cnt + add_set->cnt), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; goto end; } /* For newly allocated set, initialize set->cnt to 0 */ if (!tab->sets[hook]) set->cnt = 0; tab->sets[hook] = set; /* Concatenate the two sets */ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0])); /* Now that the set is copied, update with relocated BTF ids */ for (i = set->cnt; i < set->cnt + add_set->cnt; i++) set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id); set->cnt += add_set->cnt; sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); if (add_filter) { hook_filter = &tab->hook_filters[hook]; hook_filter->filters[hook_filter->nr_filters++] = kset->filter; } return 0; end: btf_free_kfunc_set_tab(btf); return ret; } static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_hook hook, u32 kfunc_btf_id, const struct bpf_prog *prog) { struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *set; u32 *id, i; if (hook >= BTF_KFUNC_HOOK_MAX) return NULL; if (!btf->kfunc_set_tab) return NULL; hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; for (i = 0; i < hook_filter->nr_filters; i++) { if (hook_filter->filters[i](prog, kfunc_btf_id)) return NULL; } set = btf->kfunc_set_tab->sets[hook]; if (!set) return NULL; id = btf_id_set8_contains(set, kfunc_btf_id); if (!id) return NULL; /* The flags for BTF ID are located next to it */ return id + 1; } static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { switch (prog_type) { case BPF_PROG_TYPE_UNSPEC: return BTF_KFUNC_HOOK_COMMON; case BPF_PROG_TYPE_XDP: return BTF_KFUNC_HOOK_XDP; case BPF_PROG_TYPE_SCHED_CLS: return BTF_KFUNC_HOOK_TC; case BPF_PROG_TYPE_STRUCT_OPS: return BTF_KFUNC_HOOK_STRUCT_OPS; case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_LSM: return BTF_KFUNC_HOOK_TRACING; case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; case BPF_PROG_TYPE_SK_SKB: return BTF_KFUNC_HOOK_SK_SKB; case BPF_PROG_TYPE_SOCKET_FILTER: return BTF_KFUNC_HOOK_SOCKET_FILTER; case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: return BTF_KFUNC_HOOK_LWT; case BPF_PROG_TYPE_NETFILTER: return BTF_KFUNC_HOOK_NETFILTER; case BPF_PROG_TYPE_KPROBE: return BTF_KFUNC_HOOK_KPROBE; default: return BTF_KFUNC_HOOK_MAX; } } /* Caution: * Reference to the module (obtained using btf_try_get_module) corresponding to * the struct btf *MUST* be held when calling this function from verifier * context. This is usually true as we stash references in prog's kfunc_btf_tab; * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. */ u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog); if (kfunc_flags) return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog); } u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog); } static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, const struct btf_kfunc_id_set *kset) { struct btf *btf; int ret, i; btf = btf_get_module_btf(kset->owner); if (!btf) return check_btf_kconfigs(kset->owner, "kfunc"); if (IS_ERR(btf)) return PTR_ERR(btf); for (i = 0; i < kset->set->cnt; i++) { ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id), kset->set->pairs[i].flags); if (ret) goto err_out; } ret = btf_populate_kfunc_set(btf, hook, kset); err_out: btf_put(btf); return ret; } /* This function must be invoked only from initcalls/module init functions */ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *kset) { enum btf_kfunc_hook hook; /* All kfuncs need to be tagged as such in BTF. * WARN() for initcall registrations that do not check errors. */ if (!(kset->set->flags & BTF_SET8_KFUNCS)) { WARN_ON(!kset->owner); return -EINVAL; } hook = bpf_prog_type_to_kfunc_hook(prog_type); return __register_btf_kfunc_id_set(hook, kset); } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); /* This function must be invoked only from initcalls/module init functions */ int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset) { return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset); } EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; struct btf_id_dtor_kfunc *dtor; if (!tab) return -ENOENT; /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func. */ BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0); dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func); if (!dtor) return -ENOENT; return dtor->kfunc_btf_id; } static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt) { const struct btf_type *dtor_func, *dtor_func_proto, *t; const struct btf_param *args; s32 dtor_btf_id; u32 nr_args, i; for (i = 0; i < cnt; i++) { dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id); dtor_func = btf_type_by_id(btf, dtor_btf_id); if (!dtor_func || !btf_type_is_func(dtor_func)) return -EINVAL; dtor_func_proto = btf_type_by_id(btf, dtor_func->type); if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto)) return -EINVAL; /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */ t = btf_type_by_id(btf, dtor_func_proto->type); if (!t || !btf_type_is_void(t)) return -EINVAL; nr_args = btf_type_vlen(dtor_func_proto); if (nr_args != 1) return -EINVAL; args = btf_params(dtor_func_proto); t = btf_type_by_id(btf, args[0].type); /* Allow any pointer type, as width on targets Linux supports * will be same for all pointer types (i.e. sizeof(void *)) */ if (!t || !btf_type_is_ptr(t)) return -EINVAL; } return 0; } /* This function must be invoked only from initcalls/module init functions */ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner) { struct btf_id_dtor_kfunc_tab *tab; struct btf *btf; u32 tab_cnt, i; int ret; btf = btf_get_module_btf(owner); if (!btf) return check_btf_kconfigs(owner, "dtor kfuncs"); if (IS_ERR(btf)) return PTR_ERR(btf); if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); ret = -E2BIG; goto end; } /* Ensure that the prototype of dtor kfuncs being registered is sane */ ret = btf_check_dtor_kfuncs(btf,