480 1 5 5 2 1 1 2 2 1 1 480 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 // SPDX-License-Identifier: GPL-2.0 /* * Encryption policy functions for per-file encryption support. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility. * * Originally written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. * Modified by Eric Biggers, 2019 for v2 policy support. */ #include <linux/random.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/mount.h> #include "fscrypt_private.h" /** * fscrypt_policies_equal() - check whether two encryption policies are the same * @policy1: the first policy * @policy2: the second policy * * Return: %true if equal, else %false */ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2) { if (policy1->version != policy2->version) return false; return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } static const union fscrypt_policy * fscrypt_get_dummy_policy(struct super_block *sb) { if (!sb->s_cop->get_dummy_policy) return NULL; return sb->s_cop->get_dummy_policy(sb); } static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_CTS) return true; if (contents_mode == FSCRYPT_MODE_AES_128_CBC && filenames_mode == FSCRYPT_MODE_AES_128_CTS) return true; if (contents_mode == FSCRYPT_MODE_ADIANTUM && filenames_mode == FSCRYPT_MODE_ADIANTUM) return true; return false; } static bool supported_direct_key_modes(const struct inode *inode, u32 contents_mode, u32 filenames_mode) { const struct fscrypt_mode *mode; if (contents_mode != filenames_mode) { fscrypt_warn(inode, "Direct key flag not allowed with different contents and filenames modes"); return false; } mode = &fscrypt_modes[contents_mode]; if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) { fscrypt_warn(inode, "Direct key flag not allowed with %s", mode->friendly_name); return false; } return true; } static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, const struct inode *inode, const char *type, int max_ino_bits, int max_lblk_bits) { struct super_block *sb = inode->i_sb; int ino_bits = 64, lblk_bits = 64; /* * IV_INO_LBLK_* exist only because of hardware limitations, and * currently the only known use case for them involves AES-256-XTS. * That's also all we test currently. For these reasons, for now only * allow AES-256-XTS here. This can be relaxed later if a use case for * IV_INO_LBLK_* with other encryption modes arises. */ if (policy->contents_encryption_mode != FSCRYPT_MODE_AES_256_XTS) { fscrypt_warn(inode, "Can't use %s policy with contents mode other than AES-256-XTS", type); return false; } /* * It's unsafe to include inode numbers in the IVs if the filesystem can * potentially renumber inodes, e.g. via filesystem shrinking. */ if (!sb->s_cop->has_stable_inodes || !sb->s_cop->has_stable_inodes(sb)) { fscrypt_warn(inode, "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers", type, sb->s_id); return false; } if (sb->s_cop->get_ino_and_lblk_bits) sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); if (ino_bits > max_ino_bits) { fscrypt_warn(inode, "Can't use %s policy on filesystem '%s' because its inode numbers are too long", type, sb->s_id); return false; } if (lblk_bits > max_lblk_bits) { fscrypt_warn(inode, "Can't use %s policy on filesystem '%s' because its block numbers are too long", type, sb->s_id); return false; } return true; } static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, const struct inode *inode) { if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", policy->contents_encryption_mode, policy->filenames_encryption_mode); return false; } if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | FSCRYPT_POLICY_FLAG_DIRECT_KEY)) { fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", policy->flags); return false; } if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; if (IS_CASEFOLDED(inode)) { /* With v1, there's no way to derive dirhash keys. */ fscrypt_warn(inode, "v1 policies can't be used on casefolded directories"); return false; } return true; } static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { int count = 0; if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", policy->contents_encryption_mode, policy->filenames_encryption_mode); return false; } if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | FSCRYPT_POLICY_FLAG_DIRECT_KEY | FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", policy->flags); return false; } count += !!(policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY); count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64); count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32); if (count > 1) { fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)", policy->flags); return false; } if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", 32, 32)) return false; /* * IV_INO_LBLK_32 hashes the inode number, so in principle it can * support any ino_bits. However, currently the inode number is gotten * from inode::i_ino which is 'unsigned long'. So for now the * implementation limit is 32 bits. */ if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", 32, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { fscrypt_warn(inode, "Reserved bits set in encryption policy"); return false; } return true; } /** * fscrypt_supported_policy() - check whether an encryption policy is supported * @policy_u: the encryption policy * @inode: the inode on which the policy will be used * * Given an encryption policy, check whether all its encryption modes and other * settings are supported by this kernel on the given inode. (But we don't * currently don't check for crypto API support here, so attempting to use an * algorithm not configured into the crypto API will still fail later.) * * Return: %true if supported, else %false */ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode) { switch (policy_u->version) { case FSCRYPT_POLICY_V1: return fscrypt_supported_v1_policy(&policy_u->v1, inode); case FSCRYPT_POLICY_V2: return fscrypt_supported_v2_policy(&policy_u->v2, inode); } return false; } /** * fscrypt_new_context() - create a new fscrypt_context * @ctx_u: output context * @policy_u: input policy * @nonce: nonce to use * * Create an fscrypt_context for an inode that is being assigned the given * encryption policy. @nonce must be a new random nonce. * * Return: the size of the new context in bytes. */ static int fscrypt_new_context(union fscrypt_context *ctx_u, const union fscrypt_policy *policy_u, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE]) { memset(ctx_u, 0, sizeof(*ctx_u)); switch (policy_u->version) { case FSCRYPT_POLICY_V1: { const struct fscrypt_policy_v1 *policy = &policy_u->v1; struct fscrypt_context_v1 *ctx = &ctx_u->v1; ctx->version = FSCRYPT_CONTEXT_V1; ctx->contents_encryption_mode = policy->contents_encryption_mode; ctx->filenames_encryption_mode = policy->filenames_encryption_mode; ctx->flags = policy->flags; memcpy(ctx->master_key_descriptor, policy->master_key_descriptor, sizeof(ctx->master_key_descriptor)); memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } case FSCRYPT_POLICY_V2: { const struct fscrypt_policy_v2 *policy = &policy_u->v2; struct fscrypt_context_v2 *ctx = &ctx_u->v2; ctx->version = FSCRYPT_CONTEXT_V2; ctx->contents_encryption_mode = policy->contents_encryption_mode; ctx->filenames_encryption_mode = policy->filenames_encryption_mode; ctx->flags = policy->flags; memcpy(ctx->master_key_identifier, policy->master_key_identifier, sizeof(ctx->master_key_identifier)); memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } } BUG(); } /** * fscrypt_policy_from_context() - convert an fscrypt_context to * an fscrypt_policy * @policy_u: output policy * @ctx_u: input context * @ctx_size: size of input context in bytes * * Given an fscrypt_context, build the corresponding fscrypt_policy. * * Return: 0 on success, or -EINVAL if the fscrypt_context has an unrecognized * version number or size. * * This does *not* validate the settings within the policy itself, e.g. the * modes, flags, and reserved bits. Use fscrypt_supported_policy() for that. */ int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size) { memset(policy_u, 0, sizeof(*policy_u)); if (!fscrypt_context_is_valid(ctx_u, ctx_size)) return -EINVAL; switch (ctx_u->version) { case FSCRYPT_CONTEXT_V1: { const struct fscrypt_context_v1 *ctx = &ctx_u->v1; struct fscrypt_policy_v1 *policy = &policy_u->v1; policy->version = FSCRYPT_POLICY_V1; policy->contents_encryption_mode = ctx->contents_encryption_mode; policy->filenames_encryption_mode = ctx->filenames_encryption_mode; policy->flags = ctx->flags; memcpy(policy->master_key_descriptor, ctx->master_key_descriptor, sizeof(policy->master_key_descriptor)); return 0; } case FSCRYPT_CONTEXT_V2: { const struct fscrypt_context_v2 *ctx = &ctx_u->v2; struct fscrypt_policy_v2 *policy = &policy_u->v2; policy->version = FSCRYPT_POLICY_V2; policy->contents_encryption_mode = ctx->contents_encryption_mode; policy->filenames_encryption_mode = ctx->filenames_encryption_mode; policy->flags = ctx->flags; memcpy(policy->__reserved, ctx->__reserved, sizeof(policy->__reserved)); memcpy(policy->master_key_identifier, ctx->master_key_identifier, sizeof(policy->master_key_identifier)); return 0; } } /* unreachable */ return -EINVAL; } /* Retrieve an inode's encryption policy */ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) { const struct fscrypt_info *ci; union fscrypt_context ctx; int ret; ci = fscrypt_get_info(inode); if (ci) { /* key available, use the cached policy */ *policy = ci->ci_policy; return 0; } if (!IS_ENCRYPTED(inode)) return -ENODATA; ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (ret < 0) return (ret == -ERANGE) ? -EINVAL : ret; return fscrypt_policy_from_context(policy, &ctx, ret); } static int set_encryption_policy(struct inode *inode, const union fscrypt_policy *policy) { u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; union fscrypt_context ctx; int ctxsize; int err; if (!fscrypt_supported_policy(policy, inode)) return -EINVAL; switch (policy->version) { case FSCRYPT_POLICY_V1: /* * The original encryption policy version provided no way of * verifying that the correct master key was supplied, which was * insecure in scenarios where multiple users have access to the * same encrypted files (even just read-only access). The new * encryption policy version fixes this and also implies use of * an improved key derivation function and allows non-root users * to securely remove keys. So as long as compatibility with * old kernels isn't required, it is recommended to use the new * policy version for all new encrypted directories. */ pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n", current->comm, current->pid); break; case FSCRYPT_POLICY_V2: err = fscrypt_verify_key_added(inode->i_sb, policy->v2.master_key_identifier); if (err) return err; if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy. This should only be used if there are certain hardware limitations.\n", current->comm, current->pid); break; default: WARN_ON(1); return -EINVAL; } get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); ctxsize = fscrypt_new_context(&ctx, policy, nonce); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL); } int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { union fscrypt_policy policy; union fscrypt_policy existing_policy; struct inode *inode = file_inode(filp); u8 version; int size; int ret; if (get_user(policy.version, (const u8 __user *)arg)) return -EFAULT; size = fscrypt_policy_size(&policy); if (size <= 0) return -EINVAL; /* * We should just copy the remaining 'size - 1' bytes here, but a * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to * think that size can be 0 here (despite the check above!) *and* that * it's a compile-time constant. Thus it would think copy_from_user() * is passed compile-time constant ULONG_MAX, causing the compile-time * buffer overflow check to fail, breaking the build. This only occurred * when building an i386 kernel with -Os and branch profiling enabled. * * Work around it by just copying the first byte again... */ version = policy.version; if (copy_from_user(&policy, arg, size)) return -EFAULT; policy.version = version; if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(filp); if (ret) return ret; inode_lock(inode); ret = fscrypt_get_policy(inode, &existing_policy); if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) ret = -ENOTDIR; else if (IS_DEADDIR(inode)) ret = -ENOENT; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else ret = set_encryption_policy(inode, &policy); } else if (ret == -EINVAL || (ret == 0 && !fscrypt_policies_equal(&policy, &existing_policy))) { /* The file already uses a different encryption policy. */ ret = -EEXIST; } inode_unlock(inode); mnt_drop_write_file(filp); return ret; } EXPORT_SYMBOL(fscrypt_ioctl_set_policy); /* Original ioctl version; can only get the original policy version */ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { union fscrypt_policy policy; int err; err = fscrypt_get_policy(file_inode(filp), &policy); if (err) return err; if (policy.version != FSCRYPT_POLICY_V1) return -EINVAL; if (copy_to_user(arg, &policy, sizeof(policy.v1))) return -EFAULT; return 0; } EXPORT_SYMBOL(fscrypt_ioctl_get_policy); /* Extended ioctl version; can get policies of any version */ int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg) { struct fscrypt_get_policy_ex_arg arg; union fscrypt_policy *policy = (union fscrypt_policy *)&arg.policy; size_t policy_size; int err; /* arg is policy_size, then policy */ BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0); BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) != offsetof(typeof(arg), policy)); BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy)); err = fscrypt_get_policy(file_inode(filp), policy); if (err) return err; policy_size = fscrypt_policy_size(policy); if (copy_from_user(&arg, uarg, sizeof(arg.policy_size))) return -EFAULT; if (policy_size > arg.policy_size) return -EOVERFLOW; arg.policy_size = policy_size; if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex); /* FS_IOC_GET_ENCRYPTION_NONCE: retrieve file's encryption nonce for testing */ int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) { struct inode *inode = file_inode(filp); union fscrypt_context ctx; int ret; ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (ret < 0) return ret; if (!fscrypt_context_is_valid(&ctx, ret)) return -EINVAL; if (copy_to_user(arg, fscrypt_context_nonce(&ctx), FSCRYPT_FILE_NONCE_SIZE)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce); /** * fscrypt_has_permitted_context() - is a file's encryption policy permitted * within its directory? * * @parent: inode for parent directory * @child: inode for file being looked up, opened, or linked into @parent * * Filesystems must call this before permitting access to an inode in a * situation where the parent directory is encrypted (either before allowing * ->lookup() to succeed, or for a regular file before allowing it to be opened) * and before any operation that involves linking an inode into an encrypted * directory, including link, rename, and cross rename. It enforces the * constraint that within a given encrypted directory tree, all files use the * same encryption policy. The pre-access check is needed to detect potentially * malicious offline violations of this constraint, while the link and rename * checks are needed to prevent online violations of this constraint. * * Return: 1 if permitted, 0 if forbidden. */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { union fscrypt_policy parent_policy, child_policy; int err, err1, err2; /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && !S_ISLNK(child->i_mode)) return 1; /* No restrictions if the parent directory is unencrypted */ if (!IS_ENCRYPTED(parent)) return 1; /* Encrypted directories must not contain unencrypted files */ if (!IS_ENCRYPTED(child)) return 0; /* * Both parent and child are encrypted, so verify they use the same * encryption policy. Compare the fscrypt_info structs if the keys are * available, otherwise retrieve and compare the fscrypt_contexts. * * Note that the fscrypt_context retrieval will be required frequently * when accessing an encrypted directory tree without the key. * Performance-wise this is not a big deal because we already don't * really optimize for file access without the key (to the extent that * such access is even possible), given that any attempted access * already causes a fscrypt_context retrieval and keyring search. * * In any case, if an unexpected error occurs, fall back to "forbidden". */ err = fscrypt_get_encryption_info(parent, true); if (err) return 0; err = fscrypt_get_encryption_info(child, true); if (err) return 0; err1 = fscrypt_get_policy(parent, &parent_policy); err2 = fscrypt_get_policy(child, &child_policy); /* * Allow the case where the parent and child both have an unrecognized * encryption policy, so that files with an unrecognized encryption * policy can be deleted. */ if (err1 == -EINVAL && err2 == -EINVAL) return 1; if (err1 || err2) return 0; return fscrypt_policies_equal(&parent_policy, &child_policy); } EXPORT_SYMBOL(fscrypt_has_permitted_context); /* * Return the encryption policy that new files in the directory will inherit, or * NULL if none, or an ERR_PTR() on error. If the directory is encrypted, also * ensure that its key is set up, so that the new filename can be encrypted. */ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) { int err; if (IS_ENCRYPTED(dir)) { err = fscrypt_require_key(dir); if (err) return ERR_PTR(err); return &dir->i_crypt_info->ci_policy; } return fscrypt_get_dummy_policy(dir->i_sb); } /** * fscrypt_set_context() - Set the fscrypt context of a new inode * @inode: a new inode * @fs_data: private data given by FS and passed to ->set_context() * * This should be called after fscrypt_prepare_new_inode(), generally during a * filesystem transaction. Everything here must be %GFP_NOFS-safe. * * Return: 0 on success, -errno on failure */ int fscrypt_set_context(struct inode *inode, void *fs_data) { struct fscrypt_info *ci = inode->i_crypt_info; union fscrypt_context ctx; int ctxsize; /* fscrypt_prepare_new_inode() should have set up the key already. */ if (WARN_ON_ONCE(!ci)) return -ENOKEY; BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); /* * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. */ if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data); } EXPORT_SYMBOL_GPL(fscrypt_set_context); /** * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' * @sb: the filesystem on which test_dummy_encryption is being specified * @arg: the argument to the test_dummy_encryption option. May be NULL. * @dummy_policy: the filesystem's current dummy policy (input/output, see * below) * * Handle the test_dummy_encryption mount option by creating a dummy encryption * policy, saving it in @dummy_policy, and adding the corresponding dummy * encryption key to the filesystem. If the @dummy_policy is already set, then * instead validate that it matches @arg. Don't support changing it via * remount, as that is difficult to do safely. * * Return: 0 on success (dummy policy set, or the same policy is already set); * -EEXIST if a different dummy policy is already set; * or another -errno value. */ int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, struct fscrypt_dummy_policy *dummy_policy) { struct fscrypt_key_specifier key_spec = { 0 }; int version; union fscrypt_policy *policy = NULL; int err; if (!arg) arg = "v2"; if (!strcmp(arg, "v1")) { version = FSCRYPT_POLICY_V1; key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; memset(key_spec.u.descriptor, 0x42, FSCRYPT_KEY_DESCRIPTOR_SIZE); } else if (!strcmp(arg, "v2")) { version = FSCRYPT_POLICY_V2; key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; /* key_spec.u.identifier gets filled in when adding the key */ } else { err = -EINVAL; goto out; } policy = kzalloc(sizeof(*policy), GFP_KERNEL); if (!policy) { err = -ENOMEM; goto out; } err = fscrypt_add_test_dummy_key(sb, &key_spec); if (err) goto out; policy->version = version; switch (policy->version) { case FSCRYPT_POLICY_V1: policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE); break; case FSCRYPT_POLICY_V2: policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; memcpy(policy->v2.master_key_identifier, key_spec.u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); break; default: WARN_ON(1); err = -EINVAL; goto out; } if (dummy_policy->policy) { if (fscrypt_policies_equal(policy, dummy_policy->policy)) err = 0; else err = -EEXIST; goto out; } dummy_policy->policy = policy; policy = NULL; err = 0; out: kfree(policy); return err; } EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); /** * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' * @seq: the seq_file to print the option to * @sep: the separator character to use * @sb: the filesystem whose options are being shown * * Show the test_dummy_encryption mount option, if it was specified. * This is mainly used for /proc/mounts. */ void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb); int vers; if (!policy) return; vers = policy->version; if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */ vers = 1; seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers); } EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption);
17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 MLME SAP interface * * Copyright (c) 2009, Jouni Malinen <j@w1.fi> * Copyright (c) 2015 Intel Deutschland GmbH * Copyright (C) 2019-2020 Intel Corporation */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/slab.h> #include <linux/wireless.h> #include <net/cfg80211.h> #include <net/iw_handler.h> #include "core.h" #include "nl80211.h" #include "rdev-ops.h" void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, const u8 *buf, size_t len, int uapsd_queues, const u8 *req_ies, size_t req_ies_len) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; struct cfg80211_connect_resp_params cr; const u8 *resp_ie = mgmt->u.assoc_resp.variable; size_t resp_ie_len = len - offsetof(struct ieee80211_mgmt, u.assoc_resp.variable); if (bss->channel->band == NL80211_BAND_S1GHZ) { resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable; resp_ie_len = len - offsetof(struct ieee80211_mgmt, u.s1g_assoc_resp.variable); } memset(&cr, 0, sizeof(cr)); cr.status = (int)le16_to_cpu(mgmt->u.assoc_resp.status_code); cr.bssid = mgmt->bssid; cr.bss = bss; cr.req_ie = req_ies; cr.req_ie_len = req_ies_len; cr.resp_ie = resp_ie; cr.resp_ie_len = resp_ie_len; cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; trace_cfg80211_send_rx_assoc(dev, bss); /* * This is a bit of a hack, we don't notify userspace of * a (re-)association reply if we tried to send a reassoc * and got a reject -- we only try again with an assoc * frame instead of reassoc. */ if (cfg80211_sme_rx_assoc_resp(wdev, cr.status)) { cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); return; } nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL, uapsd_queues, req_ies, req_ies_len); /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, &cr, cr.status == WLAN_STATUS_SUCCESS); } EXPORT_SYMBOL(cfg80211_rx_assoc_resp); static void cfg80211_process_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); nl80211_send_rx_auth(rdev, wdev->netdev, buf, len, GFP_KERNEL); cfg80211_sme_rx_auth(wdev, buf, len); } static void cfg80211_process_deauth(struct wireless_dev *wdev, const u8 *buf, size_t len, bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); nl80211_send_deauth(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); if (!wdev->current_bss || !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); cfg80211_sme_deauth(wdev); } static void cfg80211_process_disassoc(struct wireless_dev *wdev, const u8 *buf, size_t len, bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); nl80211_send_disassoc(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); if (WARN_ON(!wdev->current_bss || !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); cfg80211_sme_disassoc(wdev); } void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_mgmt *mgmt = (void *)buf; ASSERT_WDEV_LOCK(wdev); trace_cfg80211_rx_mlme_mgmt(dev, buf, len); if (WARN_ON(len < 2)) return; if (ieee80211_is_auth(mgmt->frame_control)) cfg80211_process_auth(wdev, buf, len); else if (ieee80211_is_deauth(mgmt->frame_control)) cfg80211_process_deauth(wdev, buf, len, false); else if (ieee80211_is_disassoc(mgmt->frame_control)) cfg80211_process_disassoc(wdev, buf, len, false); } EXPORT_SYMBOL(cfg80211_rx_mlme_mgmt); void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_send_auth_timeout(dev, addr); nl80211_send_auth_timeout(rdev, dev, addr, GFP_KERNEL); cfg80211_sme_auth_timeout(wdev); } EXPORT_SYMBOL(cfg80211_auth_timeout); void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_send_assoc_timeout(dev, bss->bssid); nl80211_send_assoc_timeout(rdev, dev, bss->bssid, GFP_KERNEL); cfg80211_sme_assoc_timeout(wdev); cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); } EXPORT_SYMBOL(cfg80211_assoc_timeout); void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; cfg80211_sme_abandon_assoc(wdev); cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); } EXPORT_SYMBOL(cfg80211_abandon_assoc); void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len, bool reconnect) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_mgmt *mgmt = (void *)buf; ASSERT_WDEV_LOCK(wdev); trace_cfg80211_tx_mlme_mgmt(dev, buf, len, reconnect); if (WARN_ON(len < 2)) return; if (ieee80211_is_deauth(mgmt->frame_control)) cfg80211_process_deauth(wdev, buf, len, reconnect); else cfg80211_process_disassoc(wdev, buf, len, reconnect); } EXPORT_SYMBOL(cfg80211_tx_mlme_mgmt); void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, enum nl80211_key_type key_type, int key_id, const u8 *tsc, gfp_t gfp) { struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; char *buf = kmalloc(128, gfp); if (buf) { sprintf(buf, "MLME-MICHAELMICFAILURE.indication(" "keyid=%d %scast addr=%pM)", key_id, key_type == NL80211_KEYTYPE_GROUP ? "broad" : "uni", addr); memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = strlen(buf); wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf); kfree(buf); } #endif trace_cfg80211_michael_mic_failure(dev, addr, key_type, key_id, tsc); nl80211_michael_mic_failure(rdev, dev, addr, key_type, key_id, tsc, gfp); } EXPORT_SYMBOL(cfg80211_michael_mic_failure); /* some MLME handling for userspace SME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan, enum nl80211_auth_type auth_type, const u8 *bssid, const u8 *ssid, int ssid_len, const u8 *ie, int ie_len, const u8 *key, int key_len, int key_idx, const u8 *auth_data, int auth_data_len) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_auth_request req = { .ie = ie, .ie_len = ie_len, .auth_data = auth_data, .auth_data_len = auth_data_len, .auth_type = auth_type, .key = key, .key_len = key_len, .key_idx = key_idx, }; int err; ASSERT_WDEV_LOCK(wdev); if (auth_type == NL80211_AUTHTYPE_SHARED_KEY) if (!key || !key_len || key_idx < 0 || key_idx > 3) return -EINVAL; if (wdev->current_bss && ether_addr_equal(bssid, wdev->current_bss->pub.bssid)) return -EALREADY; req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); if (!req.bss) return -ENOENT; err = rdev_auth(rdev, dev, &req); cfg80211_put_bss(&rdev->wiphy, req.bss); return err; } /* Do a logical ht_capa &= ht_capa_mask. */ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, const struct ieee80211_ht_cap *ht_capa_mask) { int i; u8 *p1, *p2; if (!ht_capa_mask) { memset(ht_capa, 0, sizeof(*ht_capa)); return; } p1 = (u8*)(ht_capa); p2 = (u8*)(ht_capa_mask); for (i = 0; i < sizeof(*ht_capa); i++) p1[i] &= p2[i]; } /* Do a logical vht_capa &= vht_capa_mask. */ void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, const struct ieee80211_vht_cap *vht_capa_mask) { int i; u8 *p1, *p2; if (!vht_capa_mask) { memset(vht_capa, 0, sizeof(*vht_capa)); return; } p1 = (u8*)(vht_capa); p2 = (u8*)(vht_capa_mask); for (i = 0; i < sizeof(*vht_capa); i++) p1[i] &= p2[i]; } int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan, const u8 *bssid, const u8 *ssid, int ssid_len, struct cfg80211_assoc_request *req) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; ASSERT_WDEV_LOCK(wdev); if (wdev->current_bss && (!req->prev_bssid || !ether_addr_equal(wdev->current_bss->pub.bssid, req->prev_bssid))) return -EALREADY; cfg80211_oper_and_ht_capa(&req->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); cfg80211_oper_and_vht_capa(&req->vht_capa_mask, rdev->wiphy.vht_capa_mod_mask); req->bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); if (!req->bss) return -ENOENT; err = rdev_assoc(rdev, dev, req); if (!err) cfg80211_hold_bss(bss_from_pub(req->bss)); else cfg80211_put_bss(&rdev->wiphy, req->bss); return err; } int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, bool local_state_change) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_deauth_request req = { .bssid = bssid, .reason_code = reason, .ie = ie, .ie_len = ie_len, .local_state_change = local_state_change, }; ASSERT_WDEV_LOCK(wdev); if (local_state_change && (!wdev->current_bss || !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) return 0; if (ether_addr_equal(wdev->disconnect_bssid, bssid) || (wdev->current_bss && ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) wdev->conn_owner_nlportid = 0; return rdev_deauth(rdev, dev, &req); } int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, bool local_state_change) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_disassoc_request req = { .reason_code = reason, .local_state_change = local_state_change, .ie = ie, .ie_len = ie_len, }; int err; ASSERT_WDEV_LOCK(wdev); if (!wdev->current_bss) return -ENOTCONN; if (ether_addr_equal(wdev->current_bss->pub.bssid, bssid)) req.bss = &wdev->current_bss->pub; else return -ENOTCONN; err = rdev_disassoc(rdev, dev, &req); if (err) return err; /* driver should have reported the disassoc */ WARN_ON(wdev->current_bss); return 0; } void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; u8 bssid[ETH_ALEN]; ASSERT_WDEV_LOCK(wdev); if (!rdev->ops->deauth) return; if (!wdev->current_bss) return; memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN); cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); } struct cfg80211_mgmt_registration { struct list_head list; struct wireless_dev *wdev; u32 nlportid; int match_len; __le16 frame_type; bool multicast_rx; u8 match[]; }; static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct wireless_dev *tmp; struct cfg80211_mgmt_registration *reg; struct mgmt_frame_regs upd = {}; lockdep_assert_held(&rdev->wiphy.mtx); spin_lock_bh(&rdev->mgmt_registrations_lock); if (!wdev->mgmt_registrations_need_update) { spin_unlock_bh(&rdev->mgmt_registrations_lock); return; } rcu_read_lock(); list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { list_for_each_entry(reg, &tmp->mgmt_registrations, list) { u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); u32 mcast_mask = 0; if (reg->multicast_rx) mcast_mask = mask; upd.global_stypes |= mask; upd.global_mcast_stypes |= mcast_mask; if (tmp == wdev) { upd.interface_stypes |= mask; upd.interface_mcast_stypes |= mcast_mask; } } } rcu_read_unlock(); wdev->mgmt_registrations_need_update = 0; spin_unlock_bh(&rdev->mgmt_registrations_lock); rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; rdev = container_of(wk, struct cfg80211_registered_device, mgmt_registrations_update_wk); wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_mgmt_registrations_update(wdev); wiphy_unlock(&rdev->wiphy); } int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, int match_len, bool multicast_rx, struct netlink_ext_ack *extack) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; bool update_multicast = false; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT) { NL_SET_ERR_MSG(extack, "frame type not management"); return -EINVAL; } if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) { NL_SET_ERR_MSG(extack, "Invalid frame type"); return -EINVAL; } mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type))) { NL_SET_ERR_MSG(extack, "Registration to specific type not supported"); return -EINVAL; } /* * To support Pre Association Security Negotiation (PASN), registration * for authentication frames should be supported. However, as some * versions of the user space daemons wrongly register to all types of * authentication frames (which might result in unexpected behavior) * allow such registration if the request is for a specific * authentication algorithm number. */ if (wdev->iftype == NL80211_IFTYPE_STATION && (frame_type & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_AUTH && !(match_data && match_len >= 2)) { NL_SET_ERR_MSG(extack, "Authentication algorithm number required"); return -EINVAL; } nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL); if (!nreg) return -ENOMEM; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { int mlen = min(match_len, reg->match_len); if (frame_type != le16_to_cpu(reg->frame_type)) continue; if (memcmp(reg->match, match_data, mlen) == 0) { if (reg->multicast_rx != multicast_rx) { update_multicast = true; reg->multicast_rx = multicast_rx; break; } NL_SET_ERR_MSG(extack, "Match already configured"); err = -EALREADY; break; } } if (err) goto out; if (update_multicast) { kfree(nreg); } else { memcpy(nreg->match, match_data, match_len); nreg->match_len = match_len; nreg->nlportid = snd_portid; nreg->frame_type = cpu_to_le16(frame_type); nreg->wdev = wdev; nreg->multicast_rx = multicast_rx; list_add(&nreg->list, &wdev->mgmt_registrations); } wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); return 0; out: kfree(nreg); spin_unlock_bh(&rdev->mgmt_registrations_lock); return err; } void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { if (reg->nlportid != nlportid) continue; list_del(&reg->list); kfree(reg); wdev->mgmt_registrations_need_update = 1; schedule_work(&rdev->mgmt_registrations_update_wk); } spin_unlock_bh(&rdev->mgmt_registrations_lock); if (nlportid && rdev->crit_proto_nlportid == nlportid) { rdev->crit_proto_nlportid = 0; rdev_crit_proto_stop(rdev, wdev); } if (nlportid == wdev->ap_unexpected_nlportid) wdev->ap_unexpected_nlportid = 0; } void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { list_del(&reg->list); kfree(reg); } wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { const struct ieee80211_mgmt *mgmt; u16 stype; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; if (params->len < 24 + 1) return -EINVAL; mgmt = (const struct ieee80211_mgmt *)params->buf; if (!ieee80211_is_mgmt(mgmt->frame_control)) return -EINVAL; stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE; if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4))) return -EINVAL; if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { int err = 0; wdev_lock(wdev); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: if (!wdev->current_bss) { err = -ENOTCONN; break; } if (!ether_addr_equal(wdev->current_bss->pub.bssid, mgmt->bssid)) { err = -ENOTCONN; break; } /* * check for IBSS DA must be done by driver as * cfg80211 doesn't track the stations */ if (wdev->iftype == NL80211_IFTYPE_ADHOC) break; /* for station, check that DA is the AP */ if (!ether_addr_equal(wdev->current_bss->pub.bssid, mgmt->da)) { err = -ENOTCONN; break; } break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP_VLAN: if (!ether_addr_equal(mgmt->bssid, wdev_address(wdev))) err = -EINVAL; break; case NL80211_IFTYPE_MESH_POINT: if (!ether_addr_equal(mgmt->sa, mgmt->bssid)) { err = -EINVAL; break; } /* * check for mesh DA must be done by driver as * cfg80211 doesn't track the stations */ break; case NL80211_IFTYPE_P2P_DEVICE: /* * fall through, P2P device only supports * public action frames */ case NL80211_IFTYPE_NAN: default: err = -EOPNOTSUPP; break; } wdev_unlock(wdev); if (err) return err; } if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) { /* Allow random TA to be used with Public Action frames if the * driver has indicated support for this. Otherwise, only allow * the local address to be used. */ if (!ieee80211_is_action(mgmt->frame_control) || mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) return -EINVAL; if (!wdev->current_bss && !wiphy_ext_feature_isset( &rdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA)) return -EINVAL; if (wdev->current_bss && !wiphy_ext_feature_isset( &rdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED)) return -EINVAL; } /* Transmit the Action frame as requested by user space */ return rdev_mgmt_tx(rdev, wdev, params, cookie); } bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg; const struct ieee80211_txrx_stypes *stypes = &wiphy->mgmt_stypes[wdev->iftype]; struct ieee80211_mgmt *mgmt = (void *)buf; const u8 *data; int data_len; bool result = false; __le16 ftype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE); u16 stype; trace_cfg80211_rx_mgmt(wdev, freq, sig_dbm); stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4; if (!(stypes->rx & BIT(stype))) { trace_cfg80211_return_bool(false); return false; } data = buf + ieee80211_hdrlen(mgmt->frame_control); data_len = len - ieee80211_hdrlen(mgmt->frame_control); spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { if (reg->frame_type != ftype) continue; if (reg->match_len > data_len) continue; if (memcmp(reg->match, data, reg->match_len)) continue; /* found match! */ /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, freq, sig_dbm, buf, len, flags, GFP_ATOMIC)) continue; result = true; break; } spin_unlock_bh(&rdev->mgmt_registrations_lock); trace_cfg80211_return_bool(result); return result; } EXPORT_SYMBOL(cfg80211_rx_mgmt_khz); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { cancel_delayed_work(&rdev->dfs_update_channels_wk); queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, 0); } void cfg80211_dfs_channels_update_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct cfg80211_registered_device *rdev; struct cfg80211_chan_def chandef; struct ieee80211_supported_band *sband; struct ieee80211_channel *c; struct wiphy *wiphy; bool check_again = false; unsigned long timeout, next_time = 0; unsigned long time_dfs_update; enum nl80211_radar_event radar_event; int bandid, i; rdev = container_of(delayed_work, struct cfg80211_registered_device, dfs_update_channels_wk); wiphy = &rdev->wiphy; rtnl_lock(); for (bandid = 0; bandid < NUM_NL80211_BANDS; bandid++) { sband = wiphy->bands[bandid]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { c = &sband->channels[i]; if (!(c->flags & IEEE80211_CHAN_RADAR)) continue; if (c->dfs_state != NL80211_DFS_UNAVAILABLE && c->dfs_state != NL80211_DFS_AVAILABLE) continue; if (c->dfs_state == NL80211_DFS_UNAVAILABLE) { time_dfs_update = IEEE80211_DFS_MIN_NOP_TIME_MS; radar_event = NL80211_RADAR_NOP_FINISHED; } else { if (regulatory_pre_cac_allowed(wiphy) || cfg80211_any_wiphy_oper_chan(wiphy, c)) continue; time_dfs_update = REG_PRE_CAC_EXPIRY_GRACE_MS; radar_event = NL80211_RADAR_PRE_CAC_EXPIRED; } timeout = c->dfs_state_entered + msecs_to_jiffies(time_dfs_update); if (time_after_eq(jiffies, timeout)) { c->dfs_state = NL80211_DFS_USABLE; c->dfs_state_entered = jiffies; cfg80211_chandef_create(&chandef, c, NL80211_CHAN_NO_HT); nl80211_radar_notify(rdev, &chandef, radar_event, NULL, GFP_ATOMIC); regulatory_propagate_dfs_state(wiphy, &chandef, c->dfs_state, radar_event); continue; } if (!check_again) next_time = timeout - jiffies; else next_time = min(next_time, timeout - jiffies); check_again = true; } } rtnl_unlock(); /* reschedule if there are other channels waiting to be cleared again */ if (check_again) queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, next_time); } void cfg80211_radar_event(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_radar_event(wiphy, chandef); /* only set the chandef supplied channel to unavailable, in * case the radar is detected on only one of multiple channels * spanned by the chandef. */ cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE); cfg80211_sched_dfs_chan_update(rdev); nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp); memcpy(&rdev->radar_chandef, chandef, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk); } EXPORT_SYMBOL(cfg80211_radar_event); void cfg80211_cac_event(struct net_device *netdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event, gfp_t gfp) { struct wireless_dev *wdev = netdev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long timeout; trace_cfg80211_cac_event(netdev, event); if (WARN_ON(!wdev->cac_started && event != NL80211_RADAR_CAC_STARTED)) return; if (WARN_ON(!wdev->chandef.chan)) return; switch (event) { case NL80211_RADAR_CAC_FINISHED: timeout = wdev->cac_start_time + msecs_to_jiffies(wdev->cac_time_ms); WARN_ON(!time_after_eq(jiffies, timeout)); cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); memcpy(&rdev->cac_done_chandef, chandef, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); fallthrough; case NL80211_RADAR_CAC_ABORTED: wdev->cac_started = false; break; case NL80211_RADAR_CAC_STARTED: wdev->cac_started = true; break; default: WARN_ON(1); return; } nl80211_radar_notify(rdev, chandef, event, netdev, gfp); } EXPORT_SYMBOL(cfg80211_cac_event);
564 564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 // SPDX-License-Identifier: GPL-2.0 /* * xfrm4_policy.c * * Changes: * Kazunori MIYAZAWA @USAGI * YOSHIFUJI Hideaki @USAGI * Split up af-specific portion * */ #include <linux/err.h> #include <linux/kernel.h> #include <linux/inetdevice.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/l3mdev.h> static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, u32 mark) { struct rtable *rt; memset(fl4, 0, sizeof(*fl4)); fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); fl4->flowi4_mark = mark; if (saddr) fl4->saddr = saddr->a4; fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; return ERR_CAST(rt); } static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, u32 mark) { struct flowi4 fl4; return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark); } static int xfrm4_get_saddr(struct net *net, int oif, xfrm_address_t *saddr, xfrm_address_t *daddr, u32 mark) { struct dst_entry *dst; struct flowi4 fl4; dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; saddr->a4 = fl4.saddr; dst_release(dst); return 0; } static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { struct rtable *rt = (struct rtable *)xdst->route; const struct flowi4 *fl4 = &fl->u.ip4; xdst->u.rt.rt_iif = fl4->flowi4_iif; xdst->u.dst.dev = dev; dev_hold(dev); /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt.rt_is_input = rt->rt_is_input; xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) xdst->u.rt.rt_gw4 = rt->rt_gw4; else if (rt->rt_gw_family == AF_INET6) xdst->u.rt.rt_gw6 = rt->rt_gw6; xdst->u.rt.rt_pmtu = rt->rt_pmtu; xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); rt_add_uncached_list(&xdst->u.rt); return 0; } static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->redirect(path, sk, skb); } static void xfrm4_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; dst_destroy_metrics_generic(dst); if (xdst->u.rt.rt_uncached_list) rt_del_uncached_list(&xdst->u.rt); xfrm_dst_destroy(xdst); } static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int unregister) { if (!unregister) return; xfrm_dst_ifdown(dst, dev); } static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, .update_pmtu = xfrm4_update_pmtu, .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_ops = &xfrm4_dst_ops_template, .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, .fill_dst = xfrm4_fill_dst, .blackhole_route = ipv4_blackhole_route, }; #ifdef CONFIG_SYSCTL static struct ctl_table xfrm4_policy_table[] = { { .procname = "xfrm4_gc_thresh", .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; static __net_init int xfrm4_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = xfrm4_policy_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh; } hdr = register_net_sysctl(net, "net/ipv4", table); if (!hdr) goto err_reg; net->ipv4.xfrm4_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static __net_exit void xfrm4_net_sysctl_exit(struct net *net) { struct ctl_table *table; if (!net->ipv4.xfrm4_hdr) return; table = net->ipv4.xfrm4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.xfrm4_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else /* CONFIG_SYSCTL */ static inline int xfrm4_net_sysctl_init(struct net *net) { return 0; } static inline void xfrm4_net_sysctl_exit(struct net *net) { } #endif static int __net_init xfrm4_net_init(struct net *net) { int ret; memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template, sizeof(xfrm4_dst_ops_template)); ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops); if (ret) return ret; ret = xfrm4_net_sysctl_init(net); if (ret) dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); return ret; } static void __net_exit xfrm4_net_exit(struct net *net) { xfrm4_net_sysctl_exit(net); dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); } static struct pernet_operations __net_initdata xfrm4_net_ops = { .init = xfrm4_net_init, .exit = xfrm4_net_exit, }; static void __init xfrm4_policy_init(void) { xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET); } void __init xfrm4_init(void) { xfrm4_state_init(); xfrm4_policy_init(); xfrm4_protocol_init(); register_pernet_subsys(&xfrm4_net_ops); }
31 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 /* * Cryptographic API. * * Glue code for the SHA512 Secure Hash Algorithm assembler * implementation using supplemental SSE3 / AVX / AVX2 instructions. * * This file is based on sha512_generic.c * * Copyright (C) 2013 Intel Corporation * Author: Tim Chen <tim.c.chen@linux.intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <asm/simd.h> asmlinkage void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, int blocks); static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha512_block_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); if (!crypto_simd_usable() || (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return crypto_sha512_update(desc, data, len); /* * Make sure struct sha512_state begins directly with the SHA512 * 512-bit internal state, as this is what the asm functions expect. */ BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); kernel_fpu_begin(); sha512_base_do_update(desc, data, len, sha512_xform); kernel_fpu_end(); return 0; } static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha512_block_fn *sha512_xform) { if (!crypto_simd_usable()) return crypto_sha512_finup(desc, data, len, out); kernel_fpu_begin(); if (len) sha512_base_do_update(desc, data, len, sha512_xform); sha512_base_do_finalize(desc, sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); } static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_ssse3); } static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_ssse3); } /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { return sha512_ssse3_finup(desc, NULL, 0, out); } static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", .cra_priority = 150, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", .cra_priority = 150, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int register_sha512_ssse3(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) return crypto_register_shashes(sha512_ssse3_algs, ARRAY_SIZE(sha512_ssse3_algs)); return 0; } static void unregister_sha512_ssse3(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) crypto_unregister_shashes(sha512_ssse3_algs, ARRAY_SIZE(sha512_ssse3_algs)); } asmlinkage void sha512_transform_avx(struct sha512_state *state, const u8 *data, int blocks); static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } return true; } static int sha512_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_avx); } static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_avx); } /* Add padding and return the message digest. */ static int sha512_avx_final(struct shash_desc *desc, u8 *out) { return sha512_avx_finup(desc, NULL, 0, out); } static struct shash_alg sha512_avx_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx_update, .final = sha512_avx_final, .finup = sha512_avx_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx", .cra_priority = 160, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx_update, .final = sha512_avx_final, .finup = sha512_avx_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx", .cra_priority = 160, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int register_sha512_avx(void) { if (avx_usable()) return crypto_register_shashes(sha512_avx_algs, ARRAY_SIZE(sha512_avx_algs)); return 0; } static void unregister_sha512_avx(void) { if (avx_usable()) crypto_unregister_shashes(sha512_avx_algs, ARRAY_SIZE(sha512_avx_algs)); } asmlinkage void sha512_transform_rorx(struct sha512_state *state, const u8 *data, int blocks); static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_rorx); } static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_rorx); } /* Add padding and return the message digest. */ static int sha512_avx2_final(struct shash_desc *desc, u8 *out) { return sha512_avx2_finup(desc, NULL, 0, out); } static struct shash_alg sha512_avx2_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx2_update, .final = sha512_avx2_final, .finup = sha512_avx2_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx2", .cra_priority = 170, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx2_update, .final = sha512_avx2_final, .finup = sha512_avx2_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx2", .cra_priority = 170, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static bool avx2_usable(void) { if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) return true; return false; } static int register_sha512_avx2(void) { if (avx2_usable()) return crypto_register_shashes(sha512_avx2_algs, ARRAY_SIZE(sha512_avx2_algs)); return 0; } static void unregister_sha512_avx2(void) { if (avx2_usable()) crypto_unregister_shashes(sha512_avx2_algs, ARRAY_SIZE(sha512_avx2_algs)); } static int __init sha512_ssse3_mod_init(void) { if (register_sha512_ssse3()) goto fail; if (register_sha512_avx()) { unregister_sha512_ssse3(); goto fail; } if (register_sha512_avx2()) { unregister_sha512_avx(); unregister_sha512_ssse3(); goto fail; } return 0; fail: return -ENODEV; } static void __exit sha512_ssse3_mod_fini(void) { unregister_sha512_avx2(); unregister_sha512_avx(); unregister_sha512_ssse3(); } module_init(sha512_ssse3_mod_init); module_exit(sha512_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha512"); MODULE_ALIAS_CRYPTO("sha512-ssse3"); MODULE_ALIAS_CRYPTO("sha512-avx"); MODULE_ALIAS_CRYPTO("sha512-avx2"); MODULE_ALIAS_CRYPTO("sha384"); MODULE_ALIAS_CRYPTO("sha384-ssse3"); MODULE_ALIAS_CRYPTO("sha384-avx"); MODULE_ALIAS_CRYPTO("sha384-avx2");
564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_pedit.c Generic packet editor * * Authors: Jamal Hadi Salim (2002-4) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_pedit.h> #include <net/tc_act/tc_pedit.h> #include <uapi/linux/tc_act/tc_pedit.h> #include <net/pkt_cls.h> static unsigned int pedit_net_id; static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED }, }; static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = { [TCA_PEDIT_KEY_EX_HTYPE] = { .type = NLA_U16 }, [TCA_PEDIT_KEY_EX_CMD] = { .type = NLA_U16 }, }; static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, u8 n) { struct tcf_pedit_key_ex *keys_ex; struct tcf_pedit_key_ex *k; const struct nlattr *ka; int err = -EINVAL; int rem; if (!nla) return NULL; keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); if (!keys_ex) return ERR_PTR(-ENOMEM); k = keys_ex; nla_for_each_nested(ka, nla, rem) { struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1]; if (!n) { err = -EINVAL; goto err_out; } n--; if (nla_type(ka) != TCA_PEDIT_KEY_EX) { err = -EINVAL; goto err_out; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_KEY_EX_MAX, ka, pedit_key_ex_policy, NULL); if (err) goto err_out; if (!tb[TCA_PEDIT_KEY_EX_HTYPE] || !tb[TCA_PEDIT_KEY_EX_CMD]) { err = -EINVAL; goto err_out; } k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]); k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]); if (k->htype > TCA_PEDIT_HDR_TYPE_MAX || k->cmd > TCA_PEDIT_CMD_MAX) { err = -EINVAL; goto err_out; } k++; } if (n) { err = -EINVAL; goto err_out; } return keys_ex; err_out: kfree(keys_ex); return ERR_PTR(err); } static int tcf_pedit_key_ex_dump(struct sk_buff *skb, struct tcf_pedit_key_ex *keys_ex, int n) { struct nlattr *keys_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEYS_EX); if (!keys_start) goto nla_failure; for (; n > 0; n--) { struct nlattr *key_start; key_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEY_EX); if (!key_start) goto nla_failure; if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) || nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) goto nla_failure; nla_nest_end(skb, key_start); keys_ex++; } nla_nest_end(skb, keys_start); return 0; nla_failure: nla_nest_cancel(skb, keys_start); return -EINVAL; } static void tcf_pedit_cleanup_rcu(struct rcu_head *head) { struct tcf_pedit_parms *parms = container_of(head, struct tcf_pedit_parms, rcu); kfree(parms->tcfp_keys_ex); kfree(parms->tcfp_keys); kfree(parms); } static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_pedit_parms *oparms, *nparms; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; struct nlattr *pattr; struct tcf_pedit *p; int ret = 0, err; int i, ksize; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) return err; pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; if (!pattr) { NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; } parm = nla_data(pattr); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) return 0; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } } else { return err; } if (!parm->nkeys) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); ret = -EINVAL; goto out_release; } ksize = parm->nkeys * sizeof(struct tc_pedit_key); if (nla_len(pattr) < sizeof(*parm) + ksize) { NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); ret = -EINVAL; goto out_release; } nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); if (!nparms) { ret = -ENOMEM; goto out_release; } nparms->tcfp_keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); if (IS_ERR(nparms->tcfp_keys_ex)) { ret = PTR_ERR(nparms->tcfp_keys_ex); goto out_free; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) { ret = err; goto out_free_ex; } nparms->tcfp_off_max_hint = 0; nparms->tcfp_flags = parm->flags; nparms->tcfp_nkeys = parm->nkeys; nparms->tcfp_keys = kmalloc(ksize, GFP_KERNEL); if (!nparms->tcfp_keys) { ret = -ENOMEM; goto put_chain; } memcpy(nparms->tcfp_keys, parm->keys, ksize); for (i = 0; i < nparms->tcfp_nkeys; ++i) { u32 cur = nparms->tcfp_keys[i].off; /* sanitize the shift value for any later use */ nparms->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, nparms->tcfp_keys[i].shift); /* The AT option can read a single byte, we can bound the actual * value with uchar max. */ cur += (0xff & nparms->tcfp_keys[i].offmask) >> nparms->tcfp_keys[i].shift; /* Each key touches 4 bytes starting from the computed offset */ nparms->tcfp_off_max_hint = max(nparms->tcfp_off_max_hint, cur + 4); } p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); spin_unlock_bh(&p->tcf_lock); if (oparms) call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); out_free_ex: kfree(nparms->tcfp_keys_ex); out_free: kfree(nparms); out_release: tcf_idr_release(*a, bind); return ret; } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; parms = rcu_dereference_protected(p->parms, 1); if (parms) call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } static bool offset_valid(struct sk_buff *skb, int offset) { if (offset > 0 && offset > skb->len) return false; if (offset < 0 && -offset > skb_headroom(skb)) return false; return true; } static int pedit_skb_hdr_offset(struct sk_buff *skb, enum pedit_header_type htype, int *hoffset) { int ret = -EINVAL; switch (htype) { case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: if (skb_mac_header_was_set(skb)) { *hoffset = skb_mac_offset(skb); ret = 0; } break; case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: *hoffset = skb_network_offset(skb); ret = 0; break; case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: if (skb_transport_header_was_set(skb)) { *hoffset = skb_transport_offset(skb); ret = 0; } break; default: ret = -EINVAL; break; } return ret; } static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; u32 max_offset; int i; parms = rcu_dereference_bh(p->parms); max_offset = (skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : skb_network_offset(skb)) + parms->tcfp_off_max_hint; if (skb_ensure_writable(skb, min(skb->len, max_offset))) goto done; tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); if (parms->tcfp_nkeys > 0) { struct tc_pedit_key *tkey = parms->tcfp_keys; struct tcf_pedit_key_ex *tkey_ex = parms->tcfp_keys_ex; enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { u32 *ptr, hdata; int offset = tkey->off; int hoffset; u32 val; int rc; if (tkey_ex) { htype = tkey_ex->htype; cmd = tkey_ex->cmd; tkey_ex++; } rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { pr_info("tc action pedit bad header type specified (0x%x)\n", htype); goto bad; } if (tkey->offmask) { u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { pr_info("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } d = skb_header_pointer(skb, hoffset + tkey->at, sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; } if (offset % 4) { pr_info("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } if (!offset_valid(skb, hoffset + offset)) { pr_info("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } ptr = skb_header_pointer(skb, hoffset + offset, sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: val = tkey->val; break; case TCA_PEDIT_KEY_EX_CMD_ADD: val = (*ptr + tkey->val) & ~tkey->mask; break; default: pr_info("tc action pedit bad command (%d)\n", cmd); goto bad; } *ptr = ((*ptr & tkey->mask) ^ val); if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; } else { WARN(1, "pedit BUG: index %d\n", p->tcf_index); } bad: spin_lock(&p->tcf_lock); p->tcf_qstats.overlimits++; spin_unlock(&p->tcf_lock); done: return p->tcf_action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_pedit *d = to_pedit(a); struct tcf_t *tm = &d->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; spin_lock_bh(&p->tcf_lock); parms = rcu_dereference_protected(p->parms, 1); s = struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; } memcpy(opt->keys, parms->tcfp_keys, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->nkeys = parms->tcfp_nkeys; opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (parms->tcfp_keys_ex) { if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex, parms->tcfp_nkeys)) goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) goto nla_put_failure; } else { if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) goto nla_put_failure; } tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; } static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, pedit_net_id); return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, .stats_update = tcf_pedit_stats_update, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, .size = sizeof(struct tcf_pedit), }; static __net_init int pedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, pedit_net_id); return tc_action_net_init(net, tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, pedit_net_id); } static struct pernet_operations pedit_net_ops = { .init = pedit_init_net, .exit_batch = pedit_exit_net, .id = &pedit_net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Packet Editor actions"); MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { return tcf_register_action(&act_pedit_ops, &pedit_net_ops); } static void __exit pedit_cleanup_module(void) { tcf_unregister_action(&act_pedit_ops, &pedit_net_ops); } module_init(pedit_init_module); module_exit(pedit_cleanup_module);
610 610 794 615 609 610 167 167 191 17 1084 1054 795 167 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018, Intel Corporation. */ /* A common module to handle registrations and notifications for paravirtual * drivers to enable accelerated datapath and support VF live migration. * * The notifier and event handling code is based on netvsc driver. */ #include <linux/module.h> #include <linux/etherdevice.h> #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/failover.h> static LIST_HEAD(failover_list); static DEFINE_SPINLOCK(failover_lock); static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) { struct net_device *failover_dev; struct failover *failover; spin_lock(&failover_lock); list_for_each_entry(failover, &failover_list, list) { failover_dev = rtnl_dereference(failover->failover_dev); if (ether_addr_equal(failover_dev->perm_addr, mac)) { *ops = rtnl_dereference(failover->ops); spin_unlock(&failover_lock); return failover_dev; } } spin_unlock(&failover_lock); return NULL; } /** * failover_slave_register - Register a slave netdev * * @slave_dev: slave netdev that is being registered * * Registers a slave device to a failover instance. Only ethernet devices * are supported. */ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: return failover_slave_link_change(event_dev); case NETDEV_CHANGENAME: return failover_slave_name_change(event_dev); default: return NOTIFY_DONE; } } static struct notifier_block failover_notifier = { .notifier_call = failover_event, }; static void failover_existing_slave_register(struct net_device *failover_dev) { struct net *net = dev_net(failover_dev); struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) failover_slave_register(dev); } rtnl_unlock(); } /** * failover_register - Register a failover instance * * @dev: failover netdev * @ops: failover ops * * Allocate and register a failover instance for a failover netdev. ops * provides handlers for slave device register/unregister/link change/ * name change events. * * Return: pointer to failover instance */ struct failover *failover_register(struct net_device *dev, struct failover_ops *ops) { struct failover *failover; if (dev->type != ARPHRD_ETHER) return ERR_PTR(-EINVAL); failover = kzalloc(sizeof(*failover), GFP_KERNEL); if (!failover) return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); dev_hold(dev); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); spin_lock(&failover_lock); list_add_tail(&failover->list, &failover_list); spin_unlock(&failover_lock); netdev_info(dev, "failover master:%s registered\n", dev->name); failover_existing_slave_register(dev); return failover; } EXPORT_SYMBOL_GPL(failover_register); /** * failover_unregister - Unregister a failover instance * * @failover: pointer to failover instance * * Unregisters and frees a failover instance. */ void failover_unregister(struct failover *failover) { struct net_device *failover_dev; failover_dev = rcu_dereference(failover->failover_dev); netdev_info(failover_dev, "failover master:%s unregistered\n", failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; dev_put(failover_dev); spin_lock(&failover_lock); list_del(&failover->list); spin_unlock(&failover_lock); kfree(failover); } EXPORT_SYMBOL_GPL(failover_unregister); static __init int failover_init(void) { register_netdevice_notifier(&failover_notifier); return 0; } module_init(failover_init); static __exit void failover_exit(void) { unregister_netdevice_notifier(&failover_notifier); } module_exit(failover_exit); MODULE_DESCRIPTION("Generic failover infrastructure/interface"); MODULE_LICENSE("GPL v2");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xdp #if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_XDP_H #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/tracepoint.h> #include <linux/bpf.h> #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ FN(DROP) \ FN(PASS) \ FN(TX) \ FN(REDIRECT) #define __XDP_ACT_TP_FN(x) \ TRACE_DEFINE_ENUM(XDP_##x); #define __XDP_ACT_SYM_FN(x) \ { XDP_##x, #x }, #define __XDP_ACT_SYM_TAB \ __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL } __XDP_ACT_MAP(__XDP_ACT_TP_FN) TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), TP_ARGS(dev, xdp, act), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) ), TP_fast_assign( __entry->prog_id = xdp->aux->id; __entry->act = act; __entry->ifindex = dev->ifindex; ), TP_printk("prog_id=%d action=%s ifindex=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex) ); TRACE_EVENT(xdp_bulk_tx, TP_PROTO(const struct net_device *dev, int sent, int drops, int err), TP_ARGS(dev, sent, drops, err), TP_STRUCT__entry( __field(int, ifindex) __field(u32, act) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->ifindex = dev->ifindex; __entry->act = XDP_TX; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", __entry->ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); #ifndef __DEVMAP_OBJ_TYPE #define __DEVMAP_OBJ_TYPE struct _bpf_dtab_netdev { struct net_device *dev; }; #endif /* __DEVMAP_OBJ_TYPE */ DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) __field(int, err) __field(int, to_ifindex) __field(u32, map_id) __field(int, map_index) ), TP_fast_assign( u32 ifindex = 0, map_index = index; if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Just leave to_ifindex to 0 if do broadcast redirect, * as tgt will be NULL. */ if (tgt) ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { ifindex = index; map_index = 0; } __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; __entry->to_ifindex = ifindex; __entry->map_id = map_id; __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_err(dev, xdp, to, err) \ trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) #define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats), TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) __field(unsigned int, xdp_pass) __field(unsigned int, xdp_drop) __field(unsigned int, xdp_redirect) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; __entry->xdp_pass = xdp_stats->pass; __entry->xdp_drop = xdp_stats->drop; __entry->xdp_redirect = xdp_stats->redirect; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " sched=%d" " xdp_pass=%u xdp_drop=%u xdp_redirect=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->sched, __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect) ); TRACE_EVENT(xdp_cpumap_enqueue, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int to_cpu), TP_ARGS(map_id, processed, drops, to_cpu), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, to_cpu) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->to_cpu = to_cpu; ), TP_printk("enqueue" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " to_cpu=%d", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->to_cpu) ); TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct net_device *from_dev, const struct net_device *to_dev, int sent, int drops, int err), TP_ARGS(from_dev, to_dev, sent, drops, err), TP_STRUCT__entry( __field(int, from_ifindex) __field(u32, act) __field(int, to_ifindex) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->from_ifindex = from_dev->ifindex; __entry->act = XDP_REDIRECT; __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ndo_xdp_xmit" " from_ifindex=%d to_ifindex=%d action=%s" " sent=%d drops=%d" " err=%d", __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); /* Expect users already include <net/xdp.h>, but not xdp_priv.h */ #include <net/xdp_priv.h> #define __MEM_TYPE_MAP(FN) \ FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); #define __MEM_TYPE_SYM_FN(x) \ { MEM_TYPE_##x, #x }, #define __MEM_TYPE_SYM_TAB \ __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 } __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, TP_PROTO(const struct xdp_mem_allocator *xa), TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; ), TP_printk("mem_id=%d mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator ) ); TRACE_EVENT(mem_connect, TP_PROTO(const struct xdp_mem_allocator *xa, const struct xdp_rxq_info *rxq), TP_ARGS(xa, rxq), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) __field(const struct xdp_rxq_info *, rxq) __field(int, ifindex) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; __entry->rxq = rxq; __entry->ifindex = rxq->dev->ifindex; ), TP_printk("mem_id=%d mem_type=%s allocator=%p" " ifindex=%d", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator, __entry->ifindex ) ); TRACE_EVENT(mem_return_failed, TP_PROTO(const struct xdp_mem_info *mem, const struct page *page), TP_ARGS(mem, page), TP_STRUCT__entry( __field(const struct page *, page) __field(u32, mem_id) __field(u32, mem_type) ), TP_fast_assign( __entry->page = page; __entry->mem_id = mem->id; __entry->mem_type = mem->type; ), TP_printk("mem_id=%d mem_type=%s page=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->page ) ); #endif /* _TRACE_XDP_H */ #include <trace/define_trace.h>
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM dccp #if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_DCCP_H #include <net/sock.h> #include "dccp.h" #include "ccids/ccid3.h" #include <linux/tracepoint.h> #include <trace/events/net_probe_common.h> TRACE_EVENT(dccp_probe, TP_PROTO(struct sock *sk, size_t size), TP_ARGS(sk, size), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, size) __field(__u16, tx_s) __field(__u32, tx_rtt) __field(__u32, tx_p) __field(__u32, tx_x_calc) __field(__u64, tx_x_recv) __field(__u64, tx_x) __field(__u32, tx_t_ipi) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct ccid3_hc_tx_sock *hc = NULL; if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) hc = ccid3_hc_tx_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->size = size; if (hc) { __entry->tx_s = hc->tx_s; __entry->tx_rtt = hc->tx_rtt; __entry->tx_p = hc->tx_p; __entry->tx_x_calc = hc->tx_x_calc; __entry->tx_x_recv = hc->tx_x_recv >> 6; __entry->tx_x = hc->tx_x >> 6; __entry->tx_t_ipi = hc->tx_t_ipi; } else { __entry->tx_s = 0; memset(&__entry->tx_rtt, 0, (void *)&__entry->tx_t_ipi - (void *)&__entry->tx_rtt + sizeof(__entry->tx_t_ipi)); } ), TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", __entry->saddr, __entry->daddr, __entry->size, __entry->tx_s, __entry->tx_rtt, __entry->tx_p, __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x, __entry->tx_t_ipi) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
63 62 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * * This file is part of the SCTP kernel implementation * * These are the state tables for the SCTP state machine. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Hui Huang <hui.huang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static const struct sctp_sm_table_entry primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state); static const struct sctp_sm_table_entry bug = { .fn = sctp_sf_bug, .name = "sctp_sf_bug" }; #define DO_LOOKUP(_max, _type, _table) \ ({ \ const struct sctp_sm_table_entry *rtn; \ \ if ((event_subtype._type > (_max))) { \ pr_warn("table %p possible attack: event %d exceeds max %d\n", \ _table, event_subtype._type, _max); \ rtn = &bug; \ } else \ rtn = &_table[event_subtype._type][(int)state]; \ \ rtn; \ }) const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype) { switch (event_type) { case SCTP_EVENT_T_CHUNK: return sctp_chunk_event_lookup(net, event_subtype.chunk, state); case SCTP_EVENT_T_TIMEOUT: return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, timeout_event_table); case SCTP_EVENT_T_OTHER: return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table); case SCTP_EVENT_T_PRIMITIVE: return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, primitive_event_table); default: /* Yikes! We got an illegal event type. */ return &bug; } } #define TYPE_SCTP_FUNC(func) {.fn = func, .name = #func} #define TYPE_SCTP_DATA { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_fast_4_4), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_DATA */ #define TYPE_SCTP_INIT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1B_init), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_reshutack), \ } /* TYPE_SCTP_INIT */ #define TYPE_SCTP_INIT_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_3_initack), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1C_ack), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_INIT_ACK */ #define TYPE_SCTP_SACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_SACK */ #define TYPE_SCTP_HEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ /* This should not happen, but we are nice. */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ } /* TYPE_SCTP_HEARTBEAT */ #define TYPE_SCTP_HEARTBEAT_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_HEARTBEAT_ACK */ #define TYPE_SCTP_ABORT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_pdiscard), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_abort), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_abort), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_abort), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_abort), \ } /* TYPE_SCTP_ABORT */ #define TYPE_SCTP_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shut_ctsn), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_SHUTDOWN */ #define TYPE_SCTP_SHUTDOWN_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ } /* TYPE_SCTP_SHUTDOWN_ACK */ #define TYPE_SCTP_ERROR { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_err), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ERROR */ #define TYPE_SCTP_COOKIE_ECHO { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1D_ce), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ } /* TYPE_SCTP_COOKIE_ECHO */ #define TYPE_SCTP_COOKIE_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1E_ca), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_COOKIE_ACK */ #define TYPE_SCTP_ECN_ECNE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ECN_ECNE */ #define TYPE_SCTP_ECN_CWR { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ECN_CWR */ #define TYPE_SCTP_SHUTDOWN_COMPLETE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_4_C), \ } /* TYPE_SCTP_SHUTDOWN_COMPLETE */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. * * For base protocol (RFC 2960). */ static const struct sctp_sm_table_entry chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_DATA, TYPE_SCTP_INIT, TYPE_SCTP_INIT_ACK, TYPE_SCTP_SACK, TYPE_SCTP_HEARTBEAT, TYPE_SCTP_HEARTBEAT_ACK, TYPE_SCTP_ABORT, TYPE_SCTP_SHUTDOWN, TYPE_SCTP_SHUTDOWN_ACK, TYPE_SCTP_ERROR, TYPE_SCTP_COOKIE_ECHO, TYPE_SCTP_COOKIE_ACK, TYPE_SCTP_ECN_ECNE, TYPE_SCTP_ECN_CWR, TYPE_SCTP_SHUTDOWN_COMPLETE, }; /* state_fn_t chunk_event_table[][] */ #define TYPE_SCTP_ASCONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF */ #define TYPE_SCTP_ASCONF_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF_ACK */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_ASCONF, TYPE_SCTP_ASCONF_ACK, }; /*state_fn_t addip_chunk_event_table[][] */ #define TYPE_SCTP_FWD_TSN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn_fast), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_FWD_TSN */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_FWD_TSN, }; /*state_fn_t prsctp_chunk_event_table[][] */ #define TYPE_SCTP_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_RECONF */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_RECONF, }; /*state_fn_t reconf_chunk_event_table[][] */ #define TYPE_SCTP_AUTH { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ } /* TYPE_SCTP_AUTH */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_AUTH, }; /*state_fn_t auth_chunk_event_table[][] */ static const struct sctp_sm_table_entry pad_chunk_event_table[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_COOKIE_WAIT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_COOKIE_ECHOED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_ESTABLISHED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_PENDING */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_SENT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_RECEIVED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_ACK_SENT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), }; /* chunk pad */ static const struct sctp_sm_table_entry chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_ootb), /* SCTP_STATE_COOKIE_WAIT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_COOKIE_ECHOED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_ESTABLISHED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_PENDING */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_SENT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_RECEIVED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_ACK_SENT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), }; /* chunk unknown */ #define TYPE_SCTP_PRIMITIVE_ASSOCIATE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asoc), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ } /* TYPE_SCTP_PRIMITIVE_ASSOCIATE */ #define TYPE_SCTP_PRIMITIVE_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_shutdown), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_shutdown),\ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_prm_shutdown), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ } /* TYPE_SCTP_PRIMITIVE_SHUTDOWN */ #define TYPE_SCTP_PRIMITIVE_ABORT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_abort), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_prm_abort), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_prm_abort), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_prm_abort), \ } /* TYPE_SCTP_PRIMITIVE_ABORT */ #define TYPE_SCTP_PRIMITIVE_SEND { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_SEND */ #define TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ } /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */ #define TYPE_SCTP_PRIMITIVE_ASCONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_ASCONF */ #define TYPE_SCTP_PRIMITIVE_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_RECONF */ /* The primary index for this table is the primitive type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_PRIMITIVE_ASSOCIATE, TYPE_SCTP_PRIMITIVE_SHUTDOWN, TYPE_SCTP_PRIMITIVE_ABORT, TYPE_SCTP_PRIMITIVE_SEND, TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT, TYPE_SCTP_PRIMITIVE_ASCONF, TYPE_SCTP_PRIMITIVE_RECONF, }; #define TYPE_SCTP_OTHER_NO_PENDING_TSN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_no_pending_tsn), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_start_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ } #define TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_icmp_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ } static const struct sctp_sm_table_entry other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_OTHER_NO_PENDING_TSN, TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH, }; #define TYPE_SCTP_EVENT_TIMEOUT_NONE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_t1_cookie_timer_expire), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T1_INIT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_t1_init_timer_expire), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T3_RTX { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T4_RTO { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_t4_timer_expire), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_SACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_autoclose_timer_expire), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_send_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_PROBE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_send_probe), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_NONE, TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, TYPE_SCTP_EVENT_TIMEOUT_T1_INIT, TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN, TYPE_SCTP_EVENT_TIMEOUT_T3_RTX, TYPE_SCTP_EVENT_TIMEOUT_T4_RTO, TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, TYPE_SCTP_EVENT_TIMEOUT_RECONF, TYPE_SCTP_EVENT_TIMEOUT_PROBE, TYPE_SCTP_EVENT_TIMEOUT_SACK, TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state) { if (state > SCTP_STATE_MAX) return &bug; if (cid == SCTP_CID_I_DATA) cid = SCTP_CID_DATA; if (cid <= SCTP_CID_BASE_MAX) return &chunk_event_table[cid][state]; switch ((u16)cid) { case SCTP_CID_FWD_TSN: case SCTP_CID_I_FWD_TSN: return &prsctp_chunk_event_table[0][state]; case SCTP_CID_ASCONF: return &addip_chunk_event_table[0][state]; case SCTP_CID_ASCONF_ACK: return &addip_chunk_event_table[1][state]; case SCTP_CID_RECONF: return &reconf_chunk_event_table[0][state]; case SCTP_CID_AUTH: return &auth_chunk_event_table[0][state]; case SCTP_CID_PAD: return &pad_chunk_event_table[state]; } return &chunk_event_table_unknown[state]; }
9 9 37 19 3 1 1 1 19 17 19 17 2 3 12 21 5 18 2 1 50 50 50 7 25 21 5 20 1 1 1 1 1 1 22 22 31 41 38 3 1 40 4 55 55 55 33 42 4 38 38 1 3 31 19 11 18 19 18 19 4 20 9 4 7 21 16 1 3 6 6 7 1 25 9 8 54 6 6 3 4 1 1 1 6 6 6 5 6 6 3 3 6 1 2 4 4 1 6 6 45 46 41 17 17 39 4 4 4 4 4 25 21 4 4 25 24 21 4 4 21 19 64 64 64 4 4 4 4 4 4 4 4 1 1 3 1 5 1 17 17 17 16 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> * * Fixes: * Alan Cox : Numerous verify_area() calls * Alan Cox : Set the ACK bit on a reset * Alan Cox : Stopped it crashing if it closed while * sk->inuse=1 and was trying to connect * (tcp_err()). * Alan Cox : All icmp error handling was broken * pointers passed where wrong and the * socket was looked up backwards. Nobody * tested any icmp error code obviously. * Alan Cox : tcp_err() now handled properly. It * wakes people on errors. poll * behaves and the icmp error race * has gone by moving it into sock.c * Alan Cox : tcp_send_reset() fixed to work for * everything not just packets for * unknown sockets. * Alan Cox : tcp option processing. * Alan Cox : Reset tweaked (still not 100%) [Had * syn rule wrong] * Herp Rosmanith : More reset fixes * Alan Cox : No longer acks invalid rst frames. * Acking any kind of RST is right out. * Alan Cox : Sets an ignore me flag on an rst * receive otherwise odd bits of prattle * escape still * Alan Cox : Fixed another acking RST frame bug. * Should stop LAN workplace lockups. * Alan Cox : Some tidyups using the new skb list * facilities * Alan Cox : sk->keepopen now seems to work * Alan Cox : Pulls options out correctly on accepts * Alan Cox : Fixed assorted sk->rqueue->next errors * Alan Cox : PSH doesn't end a TCP read. Switched a * bit to skb ops. * Alan Cox : Tidied tcp_data to avoid a potential * nasty. * Alan Cox : Added some better commenting, as the * tcp is hard to follow * Alan Cox : Removed incorrect check for 20 * psh * Michael O'Reilly : ack < copied bug fix. * Johannes Stille : Misc tcp fixes (not all in yet). * Alan Cox : FIN with no memory -> CRASH * Alan Cox : Added socket option proto entries. * Also added awareness of them to accept. * Alan Cox : Added TCP options (SOL_TCP) * Alan Cox : Switched wakeup calls to callbacks, * so the kernel can layer network * sockets. * Alan Cox : Use ip_tos/ip_ttl settings. * Alan Cox : Handle FIN (more) properly (we hope). * Alan Cox : RST frames sent on unsynchronised * state ack error. * Alan Cox : Put in missing check for SYN bit. * Alan Cox : Added tcp_select_window() aka NET2E * window non shrink trick. * Alan Cox : Added a couple of small NET2E timer * fixes * Charles Hedrick : TCP fixes * Toomas Tamm : TCP window fixes * Alan Cox : Small URG fix to rlogin ^C ack fight * Charles Hedrick : Rewrote most of it to actually work * Linus : Rewrote tcp_read() and URG handling * completely * Gerhard Koerting: Fixed some missing timer handling * Matthew Dillon : Reworked TCP machine states as per RFC * Gerhard Koerting: PC/TCP workarounds * Adam Caldwell : Assorted timer/timing errors * Matthew Dillon : Fixed another RST bug * Alan Cox : Move to kernel side addressing changes. * Alan Cox : Beginning work on TCP fastpathing * (not yet usable) * Arnt Gulbrandsen: Turbocharged tcp_check() routine. * Alan Cox : TCP fast path debugging * Alan Cox : Window clamping * Michael Riepe : Bug in tcp_check() * Matt Dillon : More TCP improvements and RST bug fixes * Matt Dillon : Yet more small nasties remove from the * TCP code (Be very nice to this man if * tcp finally works 100%) 8) * Alan Cox : BSD accept semantics. * Alan Cox : Reset on closedown bug. * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). * Michael Pall : Handle poll() after URG properly in * all cases. * Michael Pall : Undo the last fix in tcp_read_urg() * (multi URG PUSH broke rlogin). * Michael Pall : Fix the multi URG PUSH problem in * tcp_readable(), poll() after URG * works now. * Michael Pall : recv(...,MSG_OOB) never blocks in the * BSD api. * Alan Cox : Changed the semantics of sk->socket to * fix a race and a signal problem with * accept() and async I/O. * Alan Cox : Relaxed the rules on tcp_sendto(). * Yury Shevchuk : Really fixed accept() blocking problem. * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for * clients/servers which listen in on * fixed ports. * Alan Cox : Cleaned the above up and shrank it to * a sensible code size. * Alan Cox : Self connect lockup fix. * Alan Cox : No connect to multicast. * Ross Biro : Close unaccepted children on master * socket close. * Alan Cox : Reset tracing code. * Alan Cox : Spurious resets on shutdown. * Alan Cox : Giant 15 minute/60 second timer error * Alan Cox : Small whoops in polling before an * accept. * Alan Cox : Kept the state trace facility since * it's handy for debugging. * Alan Cox : More reset handler fixes. * Alan Cox : Started rewriting the code based on * the RFC's for other useful protocol * references see: Comer, KA9Q NOS, and * for a reference on the difference * between specifications and how BSD * works see the 4.4lite source. * A.N.Kuznetsov : Don't time wait on completion of tidy * close. * Linus Torvalds : Fin/Shutdown & copied_seq changes. * Linus Torvalds : Fixed BSD port reuse to work first syn * Alan Cox : Reimplemented timers as per the RFC * and using multiple timers for sanity. * Alan Cox : Small bug fixes, and a lot of new * comments. * Alan Cox : Fixed dual reader crash by locking * the buffers (much like datagram.c) * Alan Cox : Fixed stuck sockets in probe. A probe * now gets fed up of retrying without * (even a no space) answer. * Alan Cox : Extracted closing code better * Alan Cox : Fixed the closing state machine to * resemble the RFC. * Alan Cox : More 'per spec' fixes. * Jorge Cwik : Even faster checksumming. * Alan Cox : tcp_data() doesn't ack illegal PSH * only frames. At least one pc tcp stack * generates them. * Alan Cox : Cache last socket. * Alan Cox : Per route irtt. * Matt Day : poll()->select() match BSD precisely on error * Alan Cox : New buffers * Marc Tamsky : Various sk->prot->retransmits and * sk->retransmits misupdating fixed. * Fixed tcp_write_timeout: stuck close, * and TCP syn retries gets used now. * Mark Yarvis : In tcp_read_wakeup(), don't send an * ack if state is TCP_CLOSED. * Alan Cox : Look up device on a retransmit - routes may * change. Doesn't yet cope with MSS shrink right * but it's a start! * Marc Tamsky : Closing in closing fixes. * Mike Shaver : RFC1122 verifications. * Alan Cox : rcv_saddr errors. * Alan Cox : Block double connect(). * Alan Cox : Small hooks for enSKIP. * Alexey Kuznetsov: Path MTU discovery. * Alan Cox : Support soft errors. * Alan Cox : Fix MTU discovery pathological case * when the remote claims no mtu! * Marc Tamsky : TCP_CLOSE fix. * Colin (G3TNE) : Send a reset on syn ack replies in * window but wrong (fixes NT lpd problems) * Pedro Roque : Better TCP window handling, delayed ack. * Joerg Reuter : No modification of locked buffers in * tcp_do_retransmit() * Eric Schenk : Changed receiver side silly window * avoidance algorithm to BSD style * algorithm. This doubles throughput * against machines running Solaris, * and seems to result in general * improvement. * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * Keith Owens : Do proper merging with partial SKB's in * tcp_do_sendmsg to avoid burstiness. * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). * Andi Kleen : Make poll agree with SIGIO * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and * lingertime == 0 (RFC 793 ABORT Call) * Hirokazu Takahashi : Use copy_from_user() instead of * csum_and_copy_from_user() if possible. * * Description of States: * * TCP_SYN_SENT sent a connection request, waiting for ack * * TCP_SYN_RECV received a connection request, sent ack, * waiting for final ack in three-way handshake. * * TCP_ESTABLISHED connection established * * TCP_FIN_WAIT1 our side has shutdown, waiting to complete * transmission of remaining buffered data * * TCP_FIN_WAIT2 all buffered data sent, waiting for remote * to shutdown * * TCP_CLOSING both sides have shutdown but we still have * data we have to finish sending * * TCP_TIME_WAIT timeout to catch resent junk before entering * closed, can only be entered from FIN_WAIT2 * or CLOSING. Required because the other end * may not have gotten our last ACK causing it * to retransmit the data packet (which we ignore) * * TCP_CLOSE_WAIT remote side has shutdown and is waiting for * us to finish writing our data and to shutdown * (we have to close() to move on to LAST_ACK) * * TCP_LAST_ACK out side has shutdown after remote has * shutdown. There may still be data in our * buffer that we have to finish sending * * TCP_CLOSE socket is finished */ #define pr_fmt(fmt) "TCP: " fmt #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> #include <linux/scatterlist.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/random.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/swap.h> #include <linux/cache.h> #include <linux/err.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/static_key.h> #include <linux/btf.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/mptcp.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> /* Track pending CMSGs. */ enum { TCP_CMSG_INQ = 1, TCP_CMSG_TS = 2 }; DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); #if IS_ENABLED(CONFIG_SMC) DEFINE_STATIC_KEY_FALSE(tcp_have_smc); EXPORT_SYMBOL(tcp_have_smc); #endif /* * Current number of TCP sockets. */ struct percpu_counter tcp_sockets_allocated; EXPORT_SYMBOL(tcp_sockets_allocated); /* * TCP splice context */ struct tcp_splice_state { struct pipe_inode_info *pipe; size_t len; unsigned int flags; }; /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ unsigned long tcp_memory_pressure __read_mostly; EXPORT_SYMBOL_GPL(tcp_memory_pressure); DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); EXPORT_SYMBOL(tcp_rx_skb_cache_key); DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; if (!val) val--; if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); } EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure); void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, jiffies_to_msecs(jiffies - val)); } EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) { u8 res = 0; if (seconds > 0) { int period = timeout; res = 1; while (seconds > period && res < 255) { res++; timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return res; } /* Convert retransmits to seconds based on initial and max timeout */ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) { int period = 0; if (retrans > 0) { period = timeout; while (--retrans) { timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return period; } static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) { u32 rate = READ_ONCE(tp->rate_delivered); u32 intv = READ_ONCE(tp->rate_interval_us); u64 rate64 = 0; if (rate && intv) { rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; do_div(rate64, intv); } return rate64; } /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ tcp_snd_cwnd_set(tp, TCP_INIT_CWND); /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rate_app_limited = 1; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_sync_mss = tcp_sync_mss; WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); sk_sockets_allocated_inc(sk); sk->sk_route_forced_caps = NETIF_F_GSO; } EXPORT_SYMBOL(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) tcb->txstamp_ack = 1; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } } static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) return true; return sk_is_readable(sk); } /* * Wait for a TCP event. * * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events * by poll logic and correct handling of state changes * made by other threads is impossible in any case. */ mask = 0; /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a * socket the read side is more interesting. * * Some poll() documentation says that EPOLLHUP is incompatible * with the EPOLLOUT/POLLWR flags, so somebody should check this * all. But careful, it tends to be safer to return too many * bits than too few, and you can easily break real applications * if you don't tell them that something has hung up! * * Check-me. * * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and * our fs/select.c). It means that after we received EOF, * poll always returns immediately, making impossible poll() on write() * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX * solve this dilemma. I would prefer, if EPOLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains * why EPOLLHUP is incompatible with EPOLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected or passive Fast Open socket? */ if (state != TCP_SYN_SENT && (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE) && tp->urg_data) target++; if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. Memory barrier * pairs with the input side. */ smp_mb__after_atomic(); if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else mask |= EPOLLOUT | EPOLLWRNORM; if (tp->urg_data & TCP_URG_VALID) mask |= EPOLLPRI; } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { /* Active TCP fastopen socket with defer_connect * Return EPOLLOUT so application can call write() * in order for kernel to generate SYN+data */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; } EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { struct tcp_sock *tp = tcp_sk(sk); int answ; bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; slow = lock_sock_fast(sk); answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: answ = tp->urg_data && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; } return put_user(answ, (int __user *)arg); } EXPORT_SYMBOL(tcp_ioctl); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; } static inline bool forced_push(const struct tcp_sock *tp) { return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); skb->csum = 0; tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; tcb->sacked = 0; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) { if (flags & MSG_OOB) tp->snd_up = tp->write_seq; } /* If a not yet filled skb is pushed, do not send it if * we have data packets in Qdisc or NIC queues : * Because TX completion will happen shortly, it gives a chance * to coalesce future sendmsg() payload into this skb, without * need for a timer, and with no latency trade off. * As packets containing data payload have a bigger truesize * than pure acks (dataless) packets, the last checks prevent * autocorking if we only have an ACK in Qdisc/NIC queues, * or if TX completion was delayed after we processed ACK packet. */ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && !tcp_rtx_queue_empty(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; skb = tcp_write_queue_tail(sk); if (!skb) return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); tcp_mark_urg(tp, flags); if (tcp_should_autocork(sk, skb, size_goal)) { /* avoid atomic op if TSQ_THROTTLED bit is already set */ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. */ if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize) return; } if (flags & MSG_MORE) nonagle = TCP_NAGLE_CORK; __tcp_push_pending_frames(sk, mss_now, nonagle); } static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; int ret; ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, min(rd_desc->count, len), tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; } static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) { /* Store TCP splice context information in read_descriptor_t. */ read_descriptor_t rd_desc = { .arg.data = tss, .count = tss->len, }; return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); } /** * tcp_splice_read - splice data from TCP socket to a pipe * @sock: socket to splice from * @ppos: position (not valid) * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will read pages from given socket and fill them into a pipe. * **/ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct tcp_splice_state tss = { .pipe = pipe, .len = len, .flags = flags, }; long timeo; ssize_t spliced; int ret; sock_rps_record_flow(sk); /* * We can't seek on a socket input */ if (unlikely(*ppos)) return -ESPIPE; ret = spliced = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); while (tss.len) { ret = __tcp_splice_read(sk, &tss); if (ret < 0) break; else if (!ret) { if (spliced) break; if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { ret = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* * This occurs when user tries to read * from never connected socket. */ ret = -ENOTCONN; break; } if (!timeo) { ret = -EAGAIN; break; } /* if __tcp_splice_read() got nothing while we have * an skb in receive queue, we do not want to loop. * This might happen with URG data. */ if (!skb_queue_empty(&sk->sk_receive_queue)) break; sk_wait_data(sk, &timeo, NULL); if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } continue; } tss.len -= ret; spliced += ret; if (!timeo) break; release_sock(sk); lock_sock(sk); if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } release_sock(sk); if (spliced) return spliced; return ret; } EXPORT_SYMBOL(tcp_splice_read); struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, bool force_schedule) { struct sk_buff *skb; if (likely(!size)) { skb = sk->sk_tx_skb_cache; if (skb) { skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); sk->sk_tx_skb_cache = NULL; pskb_trim(skb, 0); INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_shinfo(skb)->tx_flags = 0; memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb)); return skb; } } /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); if (unlikely(tcp_under_memory_pressure(sk))) sk_mem_reclaim_partial(sk); skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); if (likely(skb)) { bool mem_scheduled; if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); } else { mem_scheduled = sk_wmem_schedule(sk, skb->truesize); } if (likely(mem_scheduled)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; } static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); u32 new_size_goal, size_goal; if (!large_allowed) return mss_now; /* Note : tcp_tso_autosize() will eventually split this later */ new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER; new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); /* We try hard to avoid divides here */ size_goal = tp->gso_segs * mss_now; if (unlikely(new_size_goal < size_goal || new_size_goal >= size_goal + mss_now)) { tp->gso_segs = min_t(u16, new_size_goal / mss_now, sk->sk_gso_max_segs); size_goal = tp->gso_segs * mss_now; } return max(size_goal, mss_now); } int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; mss_now = tcp_current_mss(sk); *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); return mss_now; } /* In some cases, both sendpage() and sendmsg() could have added * an skb to the write queue, but failed adding payload on it. * We need to remove it to consume less memory, but more * importantly be able to generate EPOLLOUT for Edge Trigger epoll() * users. */ void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) { if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { tcp_unlink_write_queue(skb, sk); if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); sk_wmem_free_skb(sk, skb); } } struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, struct page *page, int offset, size_t *size) { struct sk_buff *skb = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); bool can_coalesce; int copy, i; if (!skb || (copy = size_goal - skb->len) <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) return NULL; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, tcp_rtx_and_write_queues_empty(sk)); if (!skb) return NULL; #ifdef CONFIG_TLS_DEVICE skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); #endif tcp_skb_entail(sk, skb); copy = size_goal; } if (copy > *size) copy = *size; i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, page, offset); if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } if (!sk_wmem_schedule(sk, copy)) return NULL; if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(page); skb_fill_page_desc_noacc(skb, i, page, offset, copy); } if (!(flags & MSG_NO_SHARED_FRAGS)) skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; skb->len += copy; skb->data_len += copy; skb->truesize += copy; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); *size = copy; return skb; } ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags) { struct tcp_sock *tp = tcp_sk(sk); int mss_now, size_goal; int err; ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ONCE(!sendpage_ok(page), "page must not be a Slab one and have page_count > 0")) return -EINVAL; /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto out_err; } sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto out_err; while (size > 0) { struct sk_buff *skb; size_t copy = size; skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy); if (!skb) goto wait_for_space; if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; copied += copy; offset += copy; size -= copy; if (!size) goto out; if (skb->len < size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); } out: if (copied) { tcp_tx_timestamp(sk, sk->sk_tsflags); if (!(flags & MSG_SENDPAGE_NOTLAST)) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } return copied; do_error: tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk)); if (copied) goto out; out_err: /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } return sk_stream_error(sk, flags, err); } EXPORT_SYMBOL_GPL(do_tcp_sendpages); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags) { if (!(sk->sk_route_caps & NETIF_F_SG)) return sock_no_sendpage_locked(sk, page, offset, size, flags); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ return do_tcp_sendpages(sk, page, offset, size, flags); } EXPORT_SYMBOL_GPL(tcp_sendpage_locked); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { int ret; lock_sock(sk); ret = tcp_sendpage_locked(sk, page, offset, size, flags); release_sock(sk); return ret; } EXPORT_SYMBOL(tcp_sendpage); void tcp_free_fastopen_req(struct tcp_sock *tp) { if (tp->fastopen_req) { kfree(tp->fastopen_req); tp->fastopen_req = NULL; } } static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sockaddr *uaddr = msg->msg_name; int err, flags; if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), sk->sk_allocation); if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; tp->fastopen_req->uarg = uarg; if (inet->defer_connect) { err = tcp_connect(sk); /* Same failure procedure as in tcp_v4/6_connect */ if (err) { tcp_set_state(sk, TCP_CLOSE); inet->inet_dport = 0; sk->sk_route_caps = 0; } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst */ if (tp->fastopen_req) { *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); inet->defer_connect = 0; } return err; } int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; bool zc = false; long timeo; flags = msg->msg_flags; if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; } zc = sk->sk_route_caps & NETIF_F_SG; if (!zc) uarg->zerocopy = 0; } if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) goto out_err; } timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto do_error; } if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); goto out_nopush; } err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out_err; /* 'common' sending to sendq */ } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { err = -EINVAL; goto out_err; } } /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Ok commence sending. */ copied = 0; restart: mss_now = tcp_send_mss(sk, &size_goal, flags); err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; while (msg_data_left(msg)) { int copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; if (sk_flush_backlog(sk)) goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, first_skb); if (!skb) goto wait_for_space; process_backlog++; skb->ip_summed = CHECKSUM_PARTIAL; tcp_skb_entail(sk, skb); copy = size_goal; /* All packets are restored as if they have * already been sent. skb_mstamp_ns isn't set to * avoid wrong rtt estimation. */ if (tp->repair) TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; } /* Try to append data to the end of skb. */ if (copy > msg_data_left(msg)) copy = msg_data_left(msg); /* Where to copy to? */ if (skb_availroom(skb) > 0 && !zc) { /* We have some space in skb head. Superb! */ copy = min_t(int, copy, skb_availroom(skb)); err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); if (err) goto do_fault; } else if (!zc) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } merge = false; } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto do_error; /* Update the skb. */ if (merge) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); page_ref_inc(pfrag->page); } pfrag->offset += copy; } else { if (!sk_wmem_schedule(sk, copy)) goto wait_for_space; err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; } if (err < 0) goto do_error; copy = err; } if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); copied += copy; if (!msg_data_left(msg)) { if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; goto out; } if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); } out: if (copied) { tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: net_zcopy_put(uarg); return copied + copied_syn; do_error: skb = tcp_write_queue_tail(sk); do_fault: tcp_remove_empty_skb(sk, skb); if (copied + copied_syn) goto out; out_err: net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int ret; lock_sock(sk); ret = tcp_sendmsg_locked(sk, msg, size); release_sock(sk); return ret; } EXPORT_SYMBOL(tcp_sendmsg); /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) */ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); /* No URG data to read. */ if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) return -ENOTCONN; if (tp->urg_data & TCP_URG_VALID) { int err = 0; char c = tp->urg_data; if (!(flags & MSG_PEEK)) tp->urg_data = TCP_URG_READ; /* Read urgent data. */ msg->msg_flags |= MSG_OOB; if (len > 0) { if (!(flags & MSG_TRUNC)) err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; return err ? -EFAULT : len; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 0; /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and * the available implementations agree in this case: * this call should never block, independent of the * blocking state of the socket. * Mike <pall@rz.uni-karlsruhe.de> */ return -EAGAIN; } static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) { struct sk_buff *skb; int copied = 0, err = 0; /* XXX -- need to support SO_PEEK_OFF */ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) return err; copied += skb->len; } skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; copied += skb->len; } return err ?: copied; } /* Clean up the receive buffer for full frames taken by the user, * then send an ACK if necessary. COPIED is the number of bytes * tcp_recvmsg has given to the user so far, it speeds up the * calculation of whether or not we must ACK for the sake of * a window update. */ void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". * * Even if window raised up to infinity, do not send window open ACK * in states, where we will not receive more. It is useless. */ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); /* Optimize, __tcp_select_window() is not cheap. */ if (2*rcv_window_now <= tp->window_clamp) { __u32 new_window = __tcp_select_window(sk); /* Send ACK now, if this read freed lots of space * in our buffer. Certainly, new_window is new window. * We can advertise it now, if it is not less than current one. * "Lots" means "at least twice" here. */ if (new_window && new_window >= 2 * rcv_window_now) time_to_ack = true; } } if (time_to_ack) tcp_send_ack(sk); } static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } /* This looks weird, but this can happen if TCP collapsing * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ sk_eat_skb(sk, skb); } return NULL; } /* * This routine provides an alternative to tcp_recvmsg() for routines * that would like to handle copying from skbuffs directly in 'sendfile' * fashion. * Note: * - It is assumed that the socket was locked by the caller. * - The routine does not block. * - At present, there is no support for reading OOB data * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; u32 offset; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { if (offset < skb->len) { int used; size_t len; len = skb->len - offset; /* Stop reading if we hit a patch of urgent data */ if (tp->urg_data) { u32 urg_offset = tp->urg_seq - seq; if (urg_offset < len) len = urg_offset; if (!len) break; } used = recv_actor(desc, skb, offset, len); if (used <= 0) { if (!copied) copied = used; break; } if (WARN_ON_ONCE(used > len)) used = len; seq += used; copied += used; offset += used; /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it * while aggregating skbs from the socket queue. */ skb = tcp_recv_skb(sk, seq - 1, &offset); if (!skb) break; /* TCP coalescing might have appended data to the skb. * Try to splice more frags */ if (offset + 1 != skb->len) continue; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { sk_eat_skb(sk, skb); ++seq; break; } sk_eat_skb(sk, skb); if (!desc->count) break; WRITE_ONCE(tp->copied_seq, seq); } WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } return copied; } EXPORT_SYMBOL(tcp_read_sock); int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); } EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { int cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; val <<= 1; if (val > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, val); tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); } return 0; } EXPORT_SYMBOL(tcp_set_rcvlowat); void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss) { if (skb->tstamp) tss->ts[0] = ktime_to_timespec64(skb->tstamp); else tss->ts[0] = (struct timespec64) {0}; if (skb_hwtstamps(skb)->hwtstamp) tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); else tss->ts[2] = (struct timespec64) {0}; } #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ vma->vm_flags |= VM_MIXEDMAP; vma->vm_ops = &tcp_vm_ops; return 0; } EXPORT_SYMBOL(tcp_mmap); static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, u32 *offset_frag) { skb_frag_t *frag; if (unlikely(offset_skb >= skb->len)) return NULL; offset_skb -= skb_headlen(skb); if ((int)offset_skb < 0 || skb_has_frag_list(skb)) return NULL; frag = skb_shinfo(skb)->frags; while (offset_skb) { if (skb_frag_size(frag) > offset_skb) { *offset_frag = offset_skb; return frag; } offset_skb -= skb_frag_size(frag); ++frag; } *offset_frag = 0; return frag; } static bool can_map_frag(const skb_frag_t *frag) { return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag); } static int find_next_mappable_frag(const skb_frag_t *frag, int remaining_in_skb) { int offset = 0; if (likely(can_map_frag(frag))) return 0; while (offset < remaining_in_skb && !can_map_frag(frag)) { offset += skb_frag_size(frag); ++frag; } return offset; } static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset) { u32 frag_offset, partial_frag_remainder = 0; int mappable_offset; skb_frag_t *frag; /* worst case: skip to next skb. try to improve on this case below */ zc->recv_skip_hint = skb->len - offset; /* Find the frag containing this offset (and how far into that frag) */ frag = skb_advance_to_frag(skb, offset, &frag_offset); if (!frag) return; if (frag_offset) { struct skb_shared_info *info = skb_shinfo(skb); /* We read part of the last frag, must recvmsg() rest of skb. */ if (frag == &info->frags[info->nr_frags - 1]) return; /* Else, we must at least read the remainder in this frag. */ partial_frag_remainder = skb_frag_size(frag) - frag_offset; zc->recv_skip_hint -= partial_frag_remainder; ++frag; } /* partial_frag_remainder: If part way through a frag, must read rest. * mappable_offset: Bytes till next mappable frag, *not* counting bytes * in partial_frag_remainder. */ mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint); zc->recv_skip_hint = mappable_offset + partial_frag_remainder; } static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; zc->length = 0; zc->recv_skip_hint = 0; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_single_range(READ, (void __user *)copy_address, inq, &iov, &msg.msg_iter); if (err) return err; err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0, tss, &zc->msg_flags); if (err < 0) return err; zc->copybuf_len = err; if (likely(zc->copybuf_len)) { struct sk_buff *skb; u32 offset; skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset); if (skb) tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset); } return 0; } static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_single_range(READ, (void __user *)copy_address, copylen, &iov, &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); if (err) return err; zc->recv_skip_hint -= copylen; *offset += copylen; *seq += copylen; return (__s32)copylen; } static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, struct sock *sk, struct sk_buff *skb, u32 *seq, s32 copybuf_len, struct scm_timestamping_internal *tss) { u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); if (!copylen) return 0; /* skb is null if inq < PAGE_SIZE. */ if (skb) { offset = *seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, *seq, &offset); if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } } zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, seq); return zc->copybuf_len < 0 ? 0 : copylen; } static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, unsigned long pages_remaining, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map, int err) { /* At least one page did not map. Try zapping if we skipped earlier. */ if (err == -EBUSY && zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { u32 maybe_zap_len; maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ zap_page_range(vma, *address, maybe_zap_len); err = 0; } if (!err) { unsigned long leftover_pages = pages_remaining; int bytes_mapped; /* We called zap_page_range, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); *seq += bytes_mapped; *address += bytes_mapped; } if (err) { /* Either we were unable to zap, OR we zapped, retried an * insert, and still had an issue. Either ways, pages_remaining * is the number of pages we were unable to map, and we unroll * some state we speculatively touched before. */ const int bytes_not_mapped = PAGE_SIZE * pages_remaining; *length -= bytes_not_mapped; zc->recv_skip_hint += bytes_not_mapped; } return err; } static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, unsigned int pages_to_map, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map) { unsigned long pages_remaining = pages_to_map; unsigned int pages_mapped; unsigned int bytes_mapped; int err; err = vm_insert_pages(vma, *address, pages, &pages_remaining); pages_mapped = pages_to_map - (unsigned int)pages_remaining; bytes_mapped = PAGE_SIZE * pages_mapped; /* Even if vm_insert_pages fails, it may have partially succeeded in * mapping (some but not all of the pages). */ *seq += bytes_mapped; *address += bytes_mapped; if (likely(!err)) return 0; /* Error: maybe zap and retry + rollback state for failed inserts. */ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, pages_remaining, address, length, seq, zc, total_bytes_to_map, err); } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { unsigned long msg_control_addr; struct msghdr cmsg_dummy; msg_control_addr = (unsigned long)zc->msg_control; cmsg_dummy.msg_control = (void *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; cmsg_dummy.msg_control_is_user = true; zc->msg_flags = 0; if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss); zc->msg_control = (__u64) ((uintptr_t)cmsg_dummy.msg_control); zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags; } } #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); const skb_frag_t *frags = NULL; unsigned int pages_to_map = 0; struct vm_area_struct *vma; struct sk_buff *skb = NULL; u32 seq = tp->copied_seq; u32 total_bytes_to_map; int inq = tcp_inq(sk); int ret; zc->copybuf_len = 0; zc->msg_flags = 0; if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; sock_rps_record_flow(sk); if (inq && inq <= copybuf_len) return receive_fallback_to_copy(sk, zc, inq, tss); if (inq < PAGE_SIZE) { zc->length = 0; zc->recv_skip_hint = inq; if (!inq && sock_flag(sk, SOCK_DONE)) return -EIO; return 0; } mmap_read_lock(current->mm); vma = vma_lookup(current->mm, address); if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(current->mm); return -EINVAL; } vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) zap_page_range(vma, address, total_bytes_to_map); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { zc->length = avail_len; zc->recv_skip_hint = avail_len; } ret = 0; while (length + PAGE_SIZE <= zc->length) { int mappable_offset; struct page *page; if (zc->recv_skip_hint < PAGE_SIZE) { u32 offset_frag; if (skb) { if (zc->recv_skip_hint > 0) break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, seq, &offset); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } zc->recv_skip_hint = skb->len - offset; frags = skb_advance_to_frag(skb, offset, &offset_frag); if (!frags || offset_frag) break; } mappable_offset = find_next_mappable_frag(frags, zc->recv_skip_hint); if (mappable_offset) { zc->recv_skip_hint = mappable_offset; break; } page = skb_frag_page(frags); prefetchw(page); pages[pages_to_map++] = page; length += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || zc->recv_skip_hint < PAGE_SIZE) { /* Either full batch, or we're about to go to next skb * (and we cannot unroll failed ops across skbs). */ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); if (ret) goto out; pages_to_map = 0; } } if (pages_to_map) { ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); } out: mmap_read_unlock(current->mm); /* Try to copy straggler data. */ if (!ret) copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, length + copylen); ret = 0; if (length == zc->length) zc->recv_skip_hint = 0; } else { if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) ret = -EIO; } zc->length = length; return ret; } #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_timespec kts = { .tv_sec = tss->ts[0].tv_sec, .tv_nsec = tss->ts[0].tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { struct __kernel_old_timespec ts_old = { .tv_sec = tss->ts[0].tv_sec, .tv_nsec = tss->ts[0].tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); } } else { if (new_tstamp) { struct __kernel_sock_timeval stv = { .tv_sec = tss->ts[0].tv_sec, .tv_usec = tss->ts[0].tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { struct __kernel_old_timeval tv = { .tv_sec = tss->ts[0].tv_sec, .tv_usec = tss->ts[0].tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } } } if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) has_timestamping = true; else tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) has_timestamping = true; else tss->ts[2] = (struct timespec64) {0}; } if (has_timestamping) { tss->ts[1] = (struct timespec64) {0}; if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, tss); else put_cmsg_scm_timestamping(msg, tss); } } static int tcp_inq_hint(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u32 copied_seq = READ_ONCE(tp->copied_seq); u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); int inq; inq = rcv_nxt - copied_seq; if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { lock_sock(sk); inq = tp->rcv_nxt - tp->copied_seq; release_sock(sk); } /* After receiving a FIN, tell the user-space to continue reading * by returning a non-zero inq. */ if (inq == 0 && sock_flag(sk, SOCK_DONE)) inq = 1; return inq; } /* * This routine copies from a sock struct into the user buffer. * * Technical note: in 2.3 we work on _locked_ socket, so that * tricks with *seq access order and skb->users are not required. * Probably, code can be easily improved even more. */ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; u32 peek_seq; u32 *seq; unsigned long used; int err; int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; if (tp->recvmsg_inq) *cmsg_flags = TCP_CMSG_INQ; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ if (flags & MSG_OOB) goto recv_urg; if (unlikely(tp->repair)) { err = -EPERM; if (!(flags & MSG_PEEK)) goto out; if (tp->repair_queue == TCP_SEND_QUEUE) goto recv_sndq; err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out; /* 'common' recv queue MSG_PEEK-ing */ } seq = &tp->copied_seq; if (flags & MSG_PEEK) { peek_seq = tp->copied_seq; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); do { u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ if (tp->urg_data && tp->urg_seq == *seq) { if (copied) break; if (signal_pending(current)) { copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } } /* Next get a buffer. */ last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { last = skb; /* Now that we have two receive queues this * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; offset = *seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || signal_pending(current)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } tcp_cleanup_rbuf(sk, copied); if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); } else { sk_wait_data(sk, &timeo, last); } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = tp->copied_seq; } continue; found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; /* Do we have urgent data here? */ if (tp->urg_data) { u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; if (!used) goto skip_copy; } } else used = urg_offset; } } if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! */ if (!copied) copied = -EFAULT; break; } } WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; tcp_rcv_space_adjust(sk); skip_copy: if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { tp->urg_data = 0; tcp_fast_path_check(sk); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= TCP_CMSG_TS; } if (used + offset < skb->len) continue; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; } while (len > 0); /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); return copied; out: return err; recv_urg: err = tcp_recv_urg(sk, msg, len, flags); goto out; recv_sndq: err = tcp_peek_sndq(sk, msg, len); goto out; } int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { int cmsg_flags = 0, ret, inq; struct scm_timestamping_internal tss; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) sk_busy_loop(sk, nonblock); lock_sock(sk); ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss, &cmsg_flags); release_sock(sk); if (cmsg_flags && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if (cmsg_flags & TCP_CMSG_INQ) { inq = tcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } } return ret; } EXPORT_SYMBOL(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; /* We defined a new enum for TCP states that are exported in BPF * so as not force the internal TCP states to be frozen. The * following checks will detect if an internal state value ever * differs from the BPF value. If this ever happens, then we will * need to remap the internal value to the BPF value before calling * tcp_call_bpf_2arg. */ BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); /* bpf uapi header bpf.h defines an anonymous enum with values * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. * But clang built vmlinux does not have this enum in DWARF * since clang removes the above code before generating IR/debuginfo. * Let us explicitly emit the type debuginfo to ensure the * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF * regardless of which compiler is used. */ BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == TCP_ESTABLISHED) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_state_store(sk, state); } EXPORT_SYMBOL_GPL(tcp_set_state); /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be * closed. */ static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; tcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } /* * Shutdown the sending side of a connection. Much like close except * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). */ void tcp_shutdown(struct sock *sk, int how) { /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. */ if (!(how & SEND_SHUTDOWN)) return; /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk)) tcp_send_fin(sk); } } EXPORT_SYMBOL(tcp_shutdown); int tcp_orphan_count_sum(void) { int i, total = 0; for_each_possible_cpu(i) total += per_cpu(tcp_orphan_count, i); return max(total, 0); } static int tcp_orphan_cache; static struct timer_list tcp_orphan_timer; #define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) static void tcp_orphan_update(struct timer_list *unused) { WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); } static bool tcp_too_many_orphans(int shift) { return READ_ONCE(tcp_orphan_cache) << shift > READ_ONCE(sysctl_tcp_max_orphans); } bool tcp_check_oom(struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; too_many_orphans = tcp_too_many_orphans(shift); out_of_socket_memory = tcp_out_of_memory(sk); if (too_many_orphans) net_info_ratelimited("too many orphaned sockets\n"); if (out_of_socket_memory) net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); return too_many_orphans || out_of_socket_memory; } void __tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; int state; sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } /* We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) len--; data_was_unread += len; __kfree_skb(skb); } sk_mem_reclaim(sk); /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk * GET in an FTP client, suspend the process, wait for the client to * advertise a zero window, then kill -9 the FTP client, wheee... * Note: timeout is always zero in such a case. */ if (unlikely(tcp_sk(sk)->repair)) { sk->sk_prot->disconnect(sk, 0); } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. */ /* RED-PEN. Formally speaking, we have broken TCP state * machine. State transitions: * * TCP_ESTABLISHED -> TCP_FIN_WAIT1 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) * TCP_CLOSE_WAIT -> TCP_LAST_ACK * * are legal only when FIN has been sent (i.e. in window), * rather than queued out of window. Purists blame. * * F.e. "RFC state" is ESTABLISHED, * if Linux state is FIN-WAIT-1, but FIN is still not sent. * * The visible declinations are that sometimes * we enter time-wait state, when it is not required really * (harmless), do not send active resets, when they are * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when * they look as CLOSING or LAST_ACK for Linux) * Probably, I missed some more holelets. * --ANK * XXX (TFO) - To start off we don't support SYN+ACK+FIN * in a single packet! (May consider it later but will * probably need API support or TCP_CORK SYN-ACK until * data is written and socket is closed.) */ tcp_send_fin(sk); } sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); local_bh_disable(); bh_lock_sock(sk); /* remove backlog if any, without releasing ownership. */ __release_sock(sk); this_cpu_inc(tcp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) goto out; /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. * * Nope, it was not mistake. It is really desired behaviour * f.e. on http servers, when such sockets are useless, but * consume significant resources. Let's do it with special * linger2 option. --ANK */ if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); if (tp->linger2 < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } } } if (sk->sk_state != TCP_CLOSE) { sk_mem_reclaim(sk); if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { /* Not possible to send reset; just close */ tcp_set_state(sk, TCP_CLOSE); } } if (sk->sk_state == TCP_CLOSE) { struct request_sock *req; req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. */ if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } /* Otherwise, socket is reprieved until protocol close. */ out: bh_unlock_sock(sk); local_bh_enable(); } void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); /* These states need RST on ABORT according to RFC793 */ static inline bool tcp_need_reset(int state) { return (1 << state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); /* Since we are deleting whole queue, no need to * list_del(&skb->tcp_tsorted_anchor) */ tcp_rtx_queue_unlink(skb, sk); sk_wmem_free_skb(sk, skb); } } void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); sk_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); skb = sk->sk_tx_skb_cache; if (skb) { __kfree_skb(skb); sk->sk_tx_skb_cache = NULL; } INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; } int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { sk->sk_err = ECONNABORTED; } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any()); sk->sk_err = ECONNRESET; } else if (old_state == TCP_SYN_SENT) sk->sk_err = ECONNRESET; tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); if (sk->sk_rx_skb_cache) { __kfree_skb(sk->sk_rx_skb_cache); sk->sk_rx_skb_cache = NULL; } WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; seq = tp->write_seq + tp->max_window + 2; if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; tp->is_cwnd_limited = 0; tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); tp->total_retrans = 0; inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; tp->segs_out = 0; tp->bytes_sent = 0; tp->bytes_acked = 0; tp->bytes_received = 0; tp->bytes_retrans = 0; tp->data_segs_in = 0; tp->data_segs_out = 0; tp->duplicate_sack[0].start_seq = 0; tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; tp->reord_seen = 0; tp->retrans_out = 0; tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rate_app_limited = 1; tp->rack.mstamp = 0; tp->rack.advanced = 0; tp->rack.reo_wnd_steps = 1; tp->rack.last_delivered = 0; tp->rack.reo_wnd_persist = 0; tp->rack.dsack_seen = 0; tp->syn_data_acked = 0; tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); inet->defer_connect = 0; tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } sk_error_report(sk); return 0; } EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && (sk->sk_state != TCP_LISTEN); } static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { struct tcp_repair_window opt; if (!tp->repair) return -EPERM; if (len != sizeof(opt)) return -EINVAL; if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) return -EFAULT; if (opt.max_window < opt.snd_wnd) return -EINVAL; if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) return -EINVAL; if (after(opt.rcv_wup, tp->rcv_nxt)) return -EINVAL; tp->snd_wl1 = opt.snd_wl1; tp->snd_wnd = opt.snd_wnd; tp->max_window = opt.max_window; tp->rcv_wnd = opt.rcv_wnd; tp->rcv_wup = opt.rcv_wup; return 0; } static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; size_t offset = 0; while (len >= sizeof(opt)) { if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { u16 snd_wscale = opt.opt_val & 0xFFFF; u16 rcv_wscale = opt.opt_val >> 16; if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) return -EFBIG; tp->rx_opt.snd_wscale = snd_wscale; tp->rx_opt.rcv_wscale = rcv_wscale; tp->rx_opt.wscale_ok = 1; } break; case TCPOPT_SACK_PERM: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.tstamp_ok = 1; break; } } return 0; } DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); EXPORT_SYMBOL(tcp_tx_delay_enabled); static void tcp_enable_tx_delay(void) { if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { static int __tcp_tx_delay_enabled = 0; if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { static_branch_enable(&tcp_tx_delay_enabled); pr_info("TCP_TX_DELAY enabled\n"); } } } /* When set indicates to always queue non-full frames. Later the user clears * this option and we transmit any pending partial frames in the queue. This is * meant to be used alongside sendfile() to get properly filled frames when the * user (for example) must write out headers with a write() call first and then * use sendfile to send out the data parts. * * TCP_CORK can be set together with TCP_NODELAY and it is stronger than * TCP_NODELAY. */ static void __tcp_sock_set_cork(struct sock *sk, bool on) { struct tcp_sock *tp = tcp_sk(sk); if (on) { tp->nonagle |= TCP_NAGLE_CORK; } else { tp->nonagle &= ~TCP_NAGLE_CORK; if (tp->nonagle & TCP_NAGLE_OFF) tp->nonagle |= TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } } void tcp_sock_set_cork(struct sock *sk, bool on) { lock_sock(sk); __tcp_sock_set_cork(sk, on); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_cork); /* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is * remembered, but it is not activated until cork is cleared. * * However, when TCP_NODELAY is set we make an explicit push, which overrides * even TCP_CORK for currently queued segments. */ static void __tcp_sock_set_nodelay(struct sock *sk, bool on) { if (on) { tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } else { tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; } } void tcp_sock_set_nodelay(struct sock *sk) { lock_sock(sk); __tcp_sock_set_nodelay(sk, true); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_nodelay); static void __tcp_sock_set_quickack(struct sock *sk, int val) { if (!val) { inet_csk_enter_pingpong_mode(sk); return; } inet_csk_exit_pingpong_mode(sk); if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) inet_csk_enter_pingpong_mode(sk); } } void tcp_sock_set_quickack(struct sock *sk, int val) { lock_sock(sk); __tcp_sock_set_quickack(sk, val); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_quickack); int tcp_sock_set_syncnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_SYNCNT) return -EINVAL; lock_sock(sk); inet_csk(sk)->icsk_syn_retries = val; release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_syncnt); void tcp_sock_set_user_timeout(struct sock *sk, u32 val) { lock_sock(sk); inet_csk(sk)->icsk_user_timeout = val; release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_user_timeout); int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (val < 1 || val > MAX_TCP_KEEPIDLE) return -EINVAL; tp->keepalive_time = val * HZ; if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { u32 elapsed = keepalive_time_elapsed(tp); if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; inet_csk_reset_keepalive_timer(sk, elapsed); } return 0; } int tcp_sock_set_keepidle(struct sock *sk, int val) { int err; lock_sock(sk); err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } EXPORT_SYMBOL(tcp_sock_set_keepidle); int tcp_sock_set_keepintvl(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; lock_sock(sk); tcp_sk(sk)->keepalive_intvl = val * HZ; release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); int tcp_sock_set_keepcnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; lock_sock(sk); tcp_sk(sk)->keepalive_probes = val; release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; tp->window_clamp = 0; } else { tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? SOCK_MIN_RCVBUF / 2 : val; tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp); } return 0; } /* * Socket option code for TCP. */ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); int val; int err = 0; /* These are data/string values, all the others are ints */ switch (optname) { case TCP_CONGESTION: { char name[TCP_CA_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; lock_sock(sk); err = tcp_set_congestion_control(sk, name, true, ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); release_sock(sk); return err; } case TCP_ULP: { char name[TCP_ULP_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_ULP_NAME_MAX - 1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; lock_sock(sk); err = tcp_set_ulp(sk, name); release_sock(sk); return err; } case TCP_FASTOPEN_KEY: { __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; __u8 *backup_key = NULL; /* Allow a backup key as well to facilitate key rotation * First key is the active one. */ if (optlen != TCP_FASTOPEN_KEY_LENGTH && optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; if (copy_from_sockptr(key, optval, optlen)) return -EFAULT; if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) backup_key = key + TCP_FASTOPEN_KEY_LENGTH; return tcp_fastopen_reset_cipher(net, sk, key, backup_key); } default: /* fallthru */ break; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); switch (optname) { case TCP_MAXSEG: /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { err = -EINVAL; break; } tp->rx_opt.user_mss = val; break; case TCP_NODELAY: __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: if (val < 0 || val > 1) err = -EINVAL; else tp->thin_lto = val; break; case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; break; case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; tcp_send_window_probe(sk); } else if (val == TCP_REPAIR_OFF_NO_WP) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; break; case TCP_REPAIR_QUEUE: if (!tp->repair) err = -EPERM; else if ((unsigned int)val < TCP_QUEUES_NR) tp->repair_queue = val; else err = -EINVAL; break; case TCP_QUEUE_SEQ: if (sk->sk_state != TCP_CLOSE) { err = -EPERM; } else if (tp->repair_queue == TCP_SEND_QUEUE) { if (!tcp_rtx_queue_empty(sk)) err = -EPERM; else WRITE_ONCE(tp->write_seq, val); } else if (tp->repair_queue == TCP_RECV_QUEUE) { if (tp->rcv_nxt != tp->copied_seq) { err = -EPERM; } else { WRITE_ONCE(tp->rcv_nxt, val); WRITE_ONCE(tp->copied_seq, val); } } else { err = -EINVAL; } break; case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; case TCP_CORK: __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) err = -EINVAL; else tp->keepalive_intvl = val * HZ; break; case TCP_KEEPCNT: if (val < 1 || val > MAX_TCP_KEEPCNT) err = -EINVAL; else tp->keepalive_probes = val; break; case TCP_SYNCNT: if (val < 1 || val > MAX_TCP_SYNCNT) err = -EINVAL; else icsk->icsk_syn_retries = val; break; case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; break; case TCP_LINGER2: if (val < 0) tp->linger2 = -1; else if (val > TCP_FIN_TIMEOUT_MAX / HZ) tp->linger2 = TCP_FIN_TIMEOUT_MAX; else tp->linger2 = val * HZ; break; case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ icsk->icsk_accept_queue.rskq_defer_accept = secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: __tcp_sock_set_quickack(sk, val); break; #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_USER_TIMEOUT: /* Cap the max time in ms TCP will retry or probe the window * before giving up and aborting (ETIMEDOUT) a connection. */ if (val < 0) err = -EINVAL; else icsk->icsk_user_timeout = val; break; case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { err = -EINVAL; } break; case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else err = -EINVAL; } else { err = -EOPNOTSUPP; } break; case TCP_FASTOPEN_NO_COOKIE: if (val > 1 || val < 0) err = -EINVAL; else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) err = -EINVAL; else tp->fastopen_no_cookie = val; break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; else tp->tsoffset = val - tcp_time_stamp_raw(); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: tp->notsent_lowat = val; sk->sk_write_space(sk); break; case TCP_INQ: if (val > 1 || val < 0) err = -EINVAL; else tp->recvmsg_inq = val; break; case TCP_TX_DELAY: if (val) tcp_enable_tx_delay(); tp->tcp_tx_delay = val; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) return icsk->icsk_af_ops->setsockopt(sk, level, optname, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) { u64 stats[__TCP_CHRONO_MAX], total = 0; enum tcp_chrono i; for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { stats[i] = tp->chrono_stat[i - 1]; if (i == tp->chrono_type) stats[i] += tcp_jiffies32 - tp->chrono_start; stats[i] *= USEC_PER_SEC / HZ; total += stats[i]; } info->tcpi_busy_time = total; info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; } /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); unsigned long rate; u32 now; u64 rate64; bool slow; memset(info, 0, sizeof(*info)); if (sk->sk_type != SOCK_STREAM) return; info->tcpi_state = inet_sk_state_load(sk); /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_pacing_rate = rate64; rate = READ_ONCE(sk->sk_max_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; info->tcpi_snd_cwnd = tcp_snd_cwnd(tp); if (info->tcpi_state == TCP_LISTEN) { /* listeners aliased fields : * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } slow = lock_sock_fast(sk); info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; if (tp->rx_opt.wscale_ok) { info->tcpi_options |= TCPI_OPT_WSCALE; info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } if (tp->ecn_flags & TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = tp->srtt_us >> 3; info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); tcp_get_info_chrono_stats(tp, info); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); static size_t tcp_opt_stats_get_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ 0; } /* Returns TTL or hop limit of an incoming packet from skb. */ static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return ip_hdr(skb)->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) return ipv6_hdr(skb)->hop_limit; else return 0; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; unsigned long rate; u64 rate64; stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; tcp_get_info_chrono_stats(tp, &info); nla_put_u64_64bit(stats, TCP_NLA_BUSY, info.tcpi_busy_time, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, info.tcpi_rwnd_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, info.tcpi_sndbuf_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); rate64 = tcp_compute_delivery_rate(tp); nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp)); nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); if (ack_skb) nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); return stats; } static int do_tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); int val, len; if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); if (len < 0) return -EINVAL; switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; case TCP_NODELAY: val = !!(tp->nonagle&TCP_NAGLE_OFF); break; case TCP_CORK: val = !!(tp->nonagle&TCP_NAGLE_CORK); break; case TCP_KEEPIDLE: val = keepalive_time_when(tp) / HZ; break; case TCP_KEEPINTVL: val = keepalive_intvl_when(tp) / HZ; break; case TCP_KEEPCNT: val = keepalive_probes(tp); break; case TCP_SYNCNT: val = icsk->icsk_syn_retries ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: val = tp->linger2; if (val >= 0) val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; break; case TCP_INFO: { struct tcp_info info; if (get_user(len, optlen)) return -EFAULT; tcp_get_info(sk, &info); len = min_t(unsigned int, len, sizeof(info)); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &info, len)) return -EFAULT; return 0; } case TCP_CC_INFO: { const struct tcp_congestion_ops *ca_ops; union tcp_cc_info info; size_t sz = 0; int attr; if (get_user(len, optlen)) return -EFAULT; ca_ops = icsk->icsk_ca_ops; if (ca_ops && ca_ops->get_info) sz = ca_ops->get_info(sk, ~0U, &attr, &info); len = min_t(unsigned int, len, sz); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &info, len)) return -EFAULT; return 0; } case TCP_QUICKACK: val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; case TCP_ULP: if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); if (!icsk->icsk_ulp_ops) { if (put_user(0, optlen)) return -EFAULT; return 0; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; case TCP_FASTOPEN_KEY: { u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; unsigned int key_len; if (get_user(len, optlen)) return -EFAULT; key_len = tcp_fastopen_get_cipher(net, icsk, key) * TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, key, len)) return -EFAULT; return 0; } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; case TCP_THIN_DUPACK: val = 0; break; case TCP_REPAIR: val = tp->repair; break; case TCP_REPAIR_QUEUE: if (tp->repair) val = tp->repair_queue; else return -EINVAL; break; case TCP_REPAIR_WINDOW: { struct tcp_repair_window opt; if (get_user(len, optlen)) return -EFAULT; if (len != sizeof(opt)) return -EINVAL; if (!tp->repair) return -EPERM; opt.snd_wl1 = tp->snd_wl1; opt.snd_wnd = tp->snd_wnd; opt.max_window = tp->max_window; opt.rcv_wnd = tp->rcv_wnd; opt.rcv_wup = tp->rcv_wup; if (copy_to_user(optval, &opt, len)) return -EFAULT; return 0; } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; else if (tp->repair_queue == TCP_RECV_QUEUE) val = tp->rcv_nxt; else return -EINVAL; break; case TCP_USER_TIMEOUT: val = icsk->icsk_user_timeout; break; case TCP_FASTOPEN: val = icsk->icsk_accept_queue.fastopenq.max_qlen; break; case TCP_FASTOPEN_CONNECT: val = tp->fastopen_connect; break; case TCP_FASTOPEN_NO_COOKIE: val = tp->fastopen_no_cookie; break; case TCP_TX_DELAY: val = tp->tcp_tx_delay; break; case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; case TCP_INQ: val = tp->recvmsg_inq; break; case TCP_SAVE_SYN: val = tp->save_syn; break; case TCP_SAVED_SYN: { if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); if (tp->saved_syn) { if (len < tcp_saved_syn_len(tp->saved_syn)) { if (put_user(tcp_saved_syn_len(tp->saved_syn), optlen)) { release_sock(sk); return -EFAULT; } release_sock(sk); return -EINVAL; } len = tcp_saved_syn_len(tp->saved_syn); if (put_user(len, optlen)) { release_sock(sk); return -EFAULT; } if (copy_to_user(optval, tp->saved_syn->data, len)) { release_sock(sk); return -EFAULT; } tcp_saved_syn_free(tp); release_sock(sk); } else { release_sock(sk); len = 0; if (put_user(len, optlen)) return -EFAULT; } return 0; } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; if (get_user(len, optlen)) return -EFAULT; if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { err = check_zeroed_user(optval + sizeof(zc), len - sizeof(zc)); if (err < 1) return err == 0 ? -EINVAL : err; len = sizeof(zc); if (put_user(len, optlen)) return -EFAULT; } if (copy_from_user(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) return -EINVAL; lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { case offsetofend(struct tcp_zerocopy_receive, msg_flags): goto zerocopy_rcv_cmsg; case offsetofend(struct tcp_zerocopy_receive, msg_controllen): case offsetofend(struct tcp_zerocopy_receive, msg_control): case offsetofend(struct tcp_zerocopy_receive, flags): case offsetofend(struct tcp_zerocopy_receive, copybuf_len): case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): goto zerocopy_rcv_inq; case offsetofend(struct tcp_zerocopy_receive, length): default: goto zerocopy_rcv_out; } zerocopy_rcv_cmsg: if (zc.msg_flags & TCP_CMSG_TS) tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); else zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk); zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: if (!err && copy_to_user(optval, &zc, len)) err = -EFAULT; return err; } #endif default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } bool tcp_bpf_bypass_getsockopt(int level, int optname) { /* TCP do_tcp_getsockopt has optimized getsockopt implementation * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. */ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) return true; return false; } EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) return icsk->icsk_af_ops->getsockopt(sk, level, optname, optval, optlen); return do_tcp_getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); static DEFINE_MUTEX(tcp_md5sig_mutex); static bool tcp_md5sig_pool_populated = false; static void __tcp_alloc_md5sig_pool(void) { struct crypto_ahash *hash; int cpu; hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(hash)) return; for_each_possible_cpu(cpu) { void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch; struct ahash_request *req; if (!scratch) { scratch = kmalloc_node(sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr), GFP_KERNEL, cpu_to_node(cpu)); if (!scratch) return; per_cpu(tcp_md5sig_pool, cpu).scratch = scratch; } if (per_cpu(tcp_md5sig_pool, cpu).md5_req) continue; req = ahash_request_alloc(hash, GFP_KERNEL); if (!req) return; ahash_request_set_callback(req, 0, NULL, NULL); per_cpu(tcp_md5sig_pool, cpu).md5_req = req; } /* before setting tcp_md5sig_pool_populated, we must commit all writes * to memory. See smp_rmb() in tcp_get_md5sig_pool() */ smp_wmb(); /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool() * and tcp_get_md5sig_pool(). */ WRITE_ONCE(tcp_md5sig_pool_populated, true); } bool tcp_alloc_md5sig_pool(void) { /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { mutex_lock(&tcp_md5sig_mutex); if (!tcp_md5sig_pool_populated) { __tcp_alloc_md5sig_pool(); if (tcp_md5sig_pool_populated) static_branch_inc(&tcp_md5_needed); } mutex_unlock(&tcp_md5sig_mutex); } /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ return READ_ONCE(tcp_md5sig_pool_populated); } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); /** * tcp_get_md5sig_pool - get md5sig_pool for this user * * We use percpu structure, so if we succeed, we exit with preemption * and BH disabled, to make sure another thread or softirq handling * wont try to get same context. */ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { local_bh_disable(); /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ if (READ_ONCE(tcp_md5sig_pool_populated)) { /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ smp_rmb(); return this_cpu_ptr(&tcp_md5sig_pool); } local_bh_enable(); return NULL; } EXPORT_SYMBOL(tcp_get_md5sig_pool); int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, const struct sk_buff *skb, unsigned int header_len) { struct scatterlist sg; const struct tcphdr *tp = tcp_hdr(skb); struct ahash_request *req = hp->md5_req; unsigned int i; const unsigned int head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); struct sk_buff *frag_iter; sg_init_table(&sg, 1); sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); ahash_request_set_crypt(req, &sg, NULL, head_data_len); if (crypto_ahash_update(req)) return 1; for (i = 0; i < shi->nr_frags; ++i) { const skb_frag_t *f = &shi->frags[i]; unsigned int offset = skb_frag_off(f); struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset)); ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f)); if (crypto_ahash_update(req)) return 1; } skb_walk_frags(skb, frag_iter) if (tcp_md5_hash_skb_data(hp, frag_iter, 0)) return 1; return 0; } EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; sg_init_one(&sg, key->key, keylen); ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); /* We use data_race() because tcp_md5_do_add() might change key->key under us */ return data_race(crypto_ahash_update(hp->md5_req)); } EXPORT_SYMBOL(tcp_md5_hash_key); #endif void tcp_done(struct sock *sk) { struct request_sock *req; /* We might be called with a new socket, after * inet_csk_prepare_forced_close() has been called * so we can not use lockdep_sock_is_held(sk) */ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); if (req) reqsk_fastopen_remove(sk, req, false); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(tcp_done); int tcp_abort(struct sock *sk, int err) { if (!sk_fullsock(sk)) { if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); local_bh_disable(); inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } return -EOPNOTSUPP; } /* Don't race with userspace socket closes such as tcp_close. */ lock_sock(sk); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); inet_csk_listen_stop(sk); } /* Don't race with BH socket closes such as inet_csk_listen_stop. */ local_bh_disable(); bh_lock_sock(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_err = err; /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); } bh_unlock_sock(sk); local_bh_enable(); tcp_write_queue_purge(sk); release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &thash_entries); if (ret) return 0; return 1; } __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 16; limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ sysctl_tcp_mem[1] = limit; /* 6.25 % */ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void) { int max_rshare, max_wshare, cnt; unsigned long limit; unsigned int i; BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof_field(struct sk_buff, cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); inet_hashinfo_init(&tcp_hashinfo); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, 17, /* one slot per 128 KB of memory */ 0, NULL, &tcp_hashinfo.ehash_mask, 0, thash_entries ? 0 : 512 * 1024); for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, 17, /* one slot per 128 KB of memory */ 0, &tcp_hashinfo.bhash_size, NULL, 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); } cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; init_net.ipv4.sysctl_tcp_rmem[1] = 131072; init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_v4_init(); tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); mptcp_init(); }
595 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/xattr.h Extended attributes handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #ifndef _LINUX_XATTR_H #define _LINUX_XATTR_H #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/user_namespace.h> #include <uapi/linux/xattr.h> struct inode; struct dentry; /* * struct xattr_handler: When @name is set, match attributes with exactly that * name. When @prefix is set instead, match attributes with that prefix and * with a non-empty suffix. */ struct xattr_handler { const char *name; const char *prefix; int flags; /* fs private flags */ bool (*list)(struct dentry *dentry); int (*get)(const struct xattr_handler *, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; const char *xattr_full_name(const struct xattr_handler *, const char *); struct xattr { const char *name; void *value; size_t value_len; }; ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct user_namespace *, struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int __vfs_setxattr(struct user_namespace *, struct dentry *, struct inode *, const char *, const void *, size_t, int); int __vfs_setxattr_noperm(struct user_namespace *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct user_namespace *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct user_namespace *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *); int __vfs_removexattr_locked(struct user_namespace *, struct dentry *, const char *, struct inode **); int vfs_removexattr(struct user_namespace *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); ssize_t vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); int xattr_supported_namespace(struct inode *inode, const char *prefix); static inline const char *xattr_prefix(const struct xattr_handler *handler) { return handler->prefix ?: handler->name; } struct simple_xattrs { struct list_head head; spinlock_t lock; }; struct simple_xattr { struct list_head list; char *name; size_t size; char value[]; }; /* * initialize the simple_xattrs structure */ static inline void simple_xattrs_init(struct simple_xattrs *xattrs) { INIT_LIST_HEAD(&xattrs->head); spin_lock_init(&xattrs->lock); } /* * free all the xattrs */ static inline void simple_xattrs_free(struct simple_xattrs *xattrs) { struct simple_xattr *xattr, *node; list_for_each_entry_safe(xattr, node, &xattrs->head, list) { kfree(xattr->name); kvfree(xattr); } } struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags, ssize_t *removed_size); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_list_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); #endif /* _LINUX_XATTR_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_STACK_H #define _LINUX_SCHED_TASK_STACK_H /* * task->stack (kernel stack) handling interfaces: */ #include <linux/sched.h> #include <linux/magic.h> #ifdef CONFIG_THREAD_INFO_IN_TASK /* * When accessing the stack of a non-current task that might exit, use * try_get_task_stack() instead. task_stack_page will return a pointer * that could get freed out from under you. */ static __always_inline void *task_stack_page(const struct task_struct *task) { return task->stack; } #define setup_thread_stack(new,old) do { } while(0) static inline unsigned long *end_of_stack(const struct task_struct *task) { #ifdef CONFIG_STACK_GROWSUP return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1; #else return task->stack; #endif } #elif !defined(__HAVE_THREAD_FUNCTIONS) #define task_stack_page(task) ((void *)(task)->stack) static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; } /* * Return the address of the last usable long on the stack. * * When the stack grows down, this is just above the thread * info struct. Going any lower will corrupt the threadinfo. * * When the stack grows up, this is the highest address. * Beyond that position, we corrupt data on the next page. */ static inline unsigned long *end_of_stack(struct task_struct *p) { #ifdef CONFIG_STACK_GROWSUP return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1; #else return (unsigned long *)(task_thread_info(p) + 1); #endif } #endif #ifdef CONFIG_THREAD_INFO_IN_TASK static inline void *try_get_task_stack(struct task_struct *tsk) { return refcount_inc_not_zero(&tsk->stack_refcount) ? task_stack_page(tsk) : NULL; } extern void put_task_stack(struct task_struct *tsk); #else static inline void *try_get_task_stack(struct task_struct *tsk) { return task_stack_page(tsk); } static inline void put_task_stack(struct task_struct *tsk) {} #endif #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) static inline int object_is_on_stack(const void *obj) { void *stack = task_stack_page(current); return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE static inline unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ # ifdef CONFIG_STACK_GROWSUP n--; # else n++; # endif } while (!*n); # ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; # else return (unsigned long)n - (unsigned long)end_of_stack(p); # endif } #endif extern void set_task_stack_end_magic(struct task_struct *tsk); #ifndef __HAVE_ARCH_KSTACK_END static inline int kstack_end(void *addr) { /* Reliable end of stack detection: * Some APM bios versions misalign the stack */ return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); } #endif #endif /* _LINUX_SCHED_TASK_STACK_H */
166 166 166 166 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 // SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc(sizeof(*qos), GFP_KERNEL); if (!qos) return -ENOMEM; n = kzalloc(3 * sizeof(*n), GFP_KERNEL); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. */ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). */ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance);
12 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 /* * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. * * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> #include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: */ #ifdef CONFIG_PERF_EVENTS # include <asm/perf_event.h> # include <asm/local64.h> #endif struct perf_guest_info_callbacks { int (*is_in_guest)(void); int (*is_user_mode)(void); unsigned long (*get_guest_ip)(void); void (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT #include <asm/hw_breakpoint.h> #endif #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/pid_namespace.h> #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> #include <linux/cgroup.h> #include <linux/refcount.h> #include <linux/security.h> #include <asm/local.h> struct perf_callchain_entry { __u64 nr; __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; short contexts; bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long off, unsigned long len); struct perf_raw_frag { union { struct perf_raw_frag *next; unsigned long pad; }; perf_copy_f copy; void *data; u32 size; } __packed; struct perf_raw_record { struct perf_raw_frag frag; u32 size; }; /* * branch stack layout: * nr: number of taken branches stored in entries[] * hw_idx: The low level index of raw branch records * for the most recent branch. * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. * The entries[] is an abstraction of raw branch records, * which may not be stored in age order in HW, e.g. Intel LBR. * The hw_idx is to expose the low level index of raw * branch record for the most recent branch aka entries[0]. * The hw_idx index is between -1 (unknown) and max depth, * which can be retrieved in /sys/devices/cpu/caps/branches. * For the architectures whose raw branch records are * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; __u64 hw_idx; struct perf_branch_entry entries[]; }; struct task_struct; /* * extra PMU register associated with an event */ struct hw_perf_event_extra { u64 config; /* register value */ unsigned int reg; /* register address or index */ int alloc; /* extra register already allocated */ int idx; /* index in shared_regs->regs[] */ }; /** * struct hw_perf_event - performance event hardware details: */ struct hw_perf_event { #ifdef CONFIG_PERF_EVENTS union { struct { /* hardware */ u64 config; u64 last_tag; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; int idx; int last_cpu; int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; struct { /* software */ struct hrtimer hrtimer; }; struct { /* tracepoint */ /* for tp_event->class */ struct list_head tp_list; }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct arch_hw_breakpoint info; struct list_head bp_list; }; #endif struct { /* amd_iommu */ u8 iommu_bank; u8 iommu_cntr; u16 padding; u64 conf; u64 conf1; }; }; /* * If the event is a per task event, this will point to the task in * question. See the comment in perf_event_alloc(). */ struct task_struct *target; /* * PMU would store hardware filter configuration * here. */ void *addr_filters; /* Last sync'ed generation of filters */ unsigned long addr_filters_gen; /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ #define PERF_HES_STOPPED 0x01 /* the counter is stopped */ #define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ #define PERF_HES_ARCH 0x04 int state; /* * The last observed hardware counter value, updated with a * local64_cmpxchg() such that pmu::read() can be called nested. */ local64_t prev_count; /* * The period to start the next sample with. */ u64 sample_period; union { struct { /* Sampling */ /* * The period we started this sample with. */ u64 last_period; /* * However much is left of the current period; * note that this is a full 64bit value and * allows for generation of periods longer * than hardware might allow. */ local64_t period_left; }; struct { /* Topdown events counting for context switch */ u64 saved_metric; u64 saved_slots; }; }; /* * State for throttling the event, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 interrupts_seq; u64 interrupts; /* * State for freq target events, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 freq_time_stamp; u64 freq_count_stamp; #endif }; struct perf_event; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ /** * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 #define PERF_PMU_CAP_NO_NMI 0x0002 #define PERF_PMU_CAP_AUX_NO_SG 0x0004 #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 #define PERF_PMU_CAP_EXCLUSIVE 0x0010 #define PERF_PMU_CAP_ITRACE 0x0020 #define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x0040 #define PERF_PMU_CAP_NO_EXCLUDE 0x0080 #define PERF_PMU_CAP_AUX_OUTPUT 0x0100 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0200 struct perf_output_handle; /** * struct pmu - generic performance monitoring unit */ struct pmu { struct list_head entry; struct module *module; struct device *dev; const struct attribute_group **attr_groups; const struct attribute_group **attr_update; const char *name; int type; /* * various common per-pmu feature flags */ int capabilities; int __percpu *pmu_disable_count; struct perf_cpu_context __percpu *pmu_cpu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. */ void (*pmu_enable) (struct pmu *pmu); /* optional */ void (*pmu_disable) (struct pmu *pmu); /* optional */ /* * Try and initialize the event for this PMU. * * Returns: * -ENOENT -- @event is not for this PMU * * -ENODEV -- @event is for this PMU but PMU not present * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * * Other error return values are allowed. */ int (*event_init) (struct perf_event *event); /* * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ #define PERF_EF_START 0x01 /* start the counter when adding */ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ /* * Adds/Removes a counter to/from the PMU, can be done inside a * transaction, see the ->*_txn() methods. * * The add/del callbacks will reserve all hardware resources required * to service the event, this includes any counter constraint * scheduling etc. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on. * * ->add() called without PERF_EF_START should result in the same state * as ->add() followed by ->stop(). * * ->del() must always PERF_EF_UPDATE stop an event. If it calls * ->stop() that must deal with already being stopped without * PERF_EF_UPDATE. */ int (*add) (struct perf_event *event, int flags); void (*del) (struct perf_event *event, int flags); /* * Starts/Stops a counter present on the PMU. * * The PMI handler should stop the counter when perf_event_overflow() * returns !0. ->start() will be used to continue. * * Also used to change the sample period. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on -- will be called from NMI context with the PMU generates * NMIs. * * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); /* * Updates the counter value of the event. * * For sampling capable PMUs this will also update the software period * hw_perf_event::period_left field. */ void (*read) (struct perf_event *event); /* * Group events scheduling is treated as a transaction, add * group events as a whole and perform one schedulability test. * If the test fails, roll back the whole group * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. * * Optional. */ void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. * * Optional. */ int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. * * Optional. */ void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, * if no implementation is provided it will default to: event->hw.idx + 1. */ int (*event_idx) (struct perf_event *event); /*optional */ /* * context-switches callback */ void (*sched_task) (struct perf_event_context *ctx, bool sched_in); /* * Kmem cache of PMU specific data */ struct kmem_cache *task_ctx_cache; /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support * implementation and Perf core context switch handling callbacks for usage * examples. */ void (*swap_task_ctx) (struct perf_event_context *prev, struct perf_event_context *next); /* optional */ /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ /* * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ /* * Take a snapshot of the AUX buffer without touching the event * state, so that preempting ->start()/->stop() callbacks does * not interfere with their logic. Called in PMI context. * * Returns the size of AUX data copied to the output handle. * * Optional. */ long (*snapshot_aux) (struct perf_event *event, struct perf_output_handle *handle, unsigned long size); /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. * * Runs in the context of the ioctl()ing process and is not serialized * with the rest of the PMU callbacks. */ int (*addr_filters_validate) (struct list_head *filters); /* optional */ /* * Synchronize address range filter configuration: * translate hw-agnostic filters into hardware configuration in * event::hw::addr_filters. * * Runs as a part of filter sync sequence that is done in ->start() * callback by calling perf_event_addr_filters_sync(). * * May (and should) traverse event::addr_filters::list, for which its * caller provides necessary serialization. */ void (*addr_filters_sync) (struct perf_event *event); /* optional */ /* * Check if event can be used for aux_output purposes for * events of this PMU. * * Runs from perf_event_open(). Should return 0 for "no match" * or non-zero for "match". */ int (*aux_output_match) (struct perf_event *event); /* optional */ /* * Filter events for PMU-specific reasons. */ int (*filter_match) (struct perf_event *event); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { PERF_ADDR_FILTER_ACTION_STOP = 0, PERF_ADDR_FILTER_ACTION_START, PERF_ADDR_FILTER_ACTION_FILTER, }; /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { struct list_head entry; struct path path; unsigned long offset; unsigned long size; enum perf_addr_filter_action_t action; }; /** * struct perf_addr_filters_head - container for address range filters * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). */ struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; unsigned int nr_file_filters; }; struct perf_addr_filter_range { unsigned long start; unsigned long size; }; /** * enum perf_event_state - the states of an event: */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, }; struct file; struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); /* * Event capabilities. For event_caps and groups caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { struct hlist_head heads[SWEVENT_HLIST_SIZE]; struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 #define PERF_ATTACH_SCHED_CB 0x20 #define PERF_ATTACH_CHILD 0x40 struct perf_cgroup; struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; struct list_head list; }; #define for_each_sibling_event(sibling, event) \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) /** * struct perf_event - performance event kernel representation: */ struct perf_event { #ifdef CONFIG_PERF_EVENTS /* * entry onto perf_event_context::event_list; * modifications require ctx->lock * RCU safe iterations. */ struct list_head event_entry; /* * Locked for modification by both ctx->mutex and ctx->lock; holding * either sufficies for read. */ struct list_head sibling_list; struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ struct rb_node group_node; u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the * group in tact which avoids us using the other two entries. */ struct list_head migrate_entry; struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; /* Not serialized. Only written during event initialization. */ int event_caps; /* The cumulative AND of all event_caps for events in this group. */ int group_caps; struct perf_event *group_leader; struct pmu *pmu; void *pmu_private; enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; /* * These are the total time in nanoseconds that the event * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. */ u64 total_time_enabled; u64 total_time_running; u64 tstamp; struct perf_event_attr attr; u16 header_size; u16 id_header_size; u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children * events have been enabled and running, respectively. */ atomic64_t child_total_time_enabled; atomic64_t child_total_time_running; /* * Protect attach/detach and child_list: */ struct mutex child_mutex; struct list_head child_list; struct perf_event *parent; int oncpu; int cpu; struct list_head owner_entry; struct task_struct *owner; /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; /* delayed work for NMIs and such */ unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; unsigned int pending_sigtrap; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct callback_head pending_task; unsigned int pending_work; atomic_t event_limit; /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filders */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; /* for aux_output events */ struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; struct pid_namespace *ns; u64 id; u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; #ifdef CONFIG_BPF_SYSCALL perf_overflow_handler_t orig_overflow_handler; struct bpf_prog *prog; u64 bpf_cookie; #endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; #endif #endif #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ #endif #ifdef CONFIG_SECURITY void *security; #endif struct list_head sb_list; #endif /* CONFIG_PERF_EVENTS */ }; struct perf_event_groups { struct rb_root tree; u64 index; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { struct pmu *pmu; /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head active_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; struct list_head pinned_active; struct list_head flexible_active; int nr_events; int nr_active; int is_active; int nr_stat; int nr_freq; int rotate_disable; /* * Set when nr_events != nr_active, except tolerant to events not * necessary to be active due to scheduling constraints, such as cgroups. */ int rotate_necessary; refcount_t refcount; struct task_struct *task; /* * Context clock, runs when context enabled. */ u64 time; u64 timestamp; u64 timeoffset; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; /* * Sum (event->pending_sigtrap + event->pending_work) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ local_t nr_pending; }; /* * Number of contexts where an event can trigger: * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int active_oncpu; int exclusive; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; struct list_head cgrp_cpuctx_entry; #endif struct list_head sched_cb_entry; int sched_cb_usage; int online; /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; u64 aux_flags; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { u64 time; u64 timestamp; u64 timeoffset; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); struct perf_sample_data { /* * Fields set by perf_sample_data_init(), group so as to * minimize the cachelines touched. */ u64 addr; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 period; union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 type; u64 ip; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; u64 stream_id; struct { u32 cpu; u32 reserved; } cpu_entry; struct perf_callchain_entry *callchain; u64 aux_size; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 phys_addr; u64 cgroup; u64 data_page_size; u64 code_page_size; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->addr = addr; data->raw = NULL; data->br_stack = NULL; data->period = period; data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); static inline bool is_default_overflow_handler(struct perf_event *event) { if (likely(event->overflow_handler == perf_event_output_forward)) return true; if (unlikely(event->overflow_handler == perf_event_output_backward)) return true; return false; } extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); extern void perf_log_lost_samples(struct perf_event *event, u64 lost); static inline bool event_has_any_exclude_flag(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; return attr->exclude_idle || attr->exclude_user || attr->exclude_kernel || attr->exclude_hv || attr->exclude_guest || attr->exclude_host; } static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; } /* * Return 1 for a software event, 0 for a hardware event */ static inline int is_software_event(struct perf_event *event) { return event->event_caps & PERF_EV_CAP_SOFTWARE; } /* * Return 1 for event in sw context, 0 for event in hw context */ static inline int in_software_context(struct perf_event *event) { return event->ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) { return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* * When generating a perf sample in-line, instead of from an interrupt / * exception, we lack a pt_regs. This is typically used from software events * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. * * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests * - sp for PERF_SAMPLE_CALLCHAIN * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) * * NOTE: assumes @regs is otherwise already 0 filled; this is important for * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) __perf_sw_event(event_id, nr, regs, addr); } DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); /* * 'Special' version for the scheduler, it hard assumes no recursion, * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(event_id, nr, regs, addr); } extern struct static_key_false perf_sched_events; static __always_inline bool __perf_sw_enabled(int swevt) { return static_key_false(&perf_swevent_enabled[swevt]); } static inline void perf_event_task_migrate(struct task_struct *task) { if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) task->sched_migrated = 1; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && task->sched_migrated) { __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); task->sched_migrated = 0; } } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); #ifdef CONFIG_CGROUP_PERF if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && perf_cgroup_from_task(prev, NULL) != perf_cgroup_from_task(next, NULL)) __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); #endif if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) { /* * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading * the callbacks between a !NULL check and dereferences, to ensure * pending stores/changes to the callback pointers are visible before a * non-NULL perf_guest_cbs is visible to readers, and to prevent a * module from unloading callbacks while readers are active. */ return rcu_dereference(perf_guest_cbs); } extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); extern void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; } else { ctx->contexts_maxed = true; return -1; /* no more room, stop walking the stack */ } } static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ } } extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); int perf_proc_update_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 /* Finer grained perf_event_open(2) access control. */ #define PERF_SECURITY_CPU 1 #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } static inline int perf_allow_kernel(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_KERNEL); } static inline int perf_allow_cpu(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); } static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags # define perf_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs #endif static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; } static inline bool has_aux(struct perf_event *event) { return event->pmu->setup_aux; } static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; } static inline bool has_addr_filter(struct perf_event *event) { return event->pmu->nr_addr_filters; } /* * An inherited event uses parent's filters */ static inline struct perf_addr_filters_head * perf_event_addr_filters(struct perf_event *event) { struct perf_addr_filters_head *ifh = &event->addr_filters; if (event->parent) ifh = &event->parent->addr_filters; return ifh; } extern void perf_event_addr_filters_sync(struct perf_event *event); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_migrate(struct task_struct *task) { } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child, u64 clone_flags) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event *perf_get_event(struct file *file) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } static inline int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline int perf_register_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline int perf_unregister_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } static inline void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else static inline void perf_restore_debug_store(void) { } #endif static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) { return frag->pad < sizeof(u64); } #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { struct device_attribute attr; u64 id; const char *event_str; }; struct perf_pmu_events_ht_attr { struct device_attribute attr; u64 id; const char *event_str_ht; const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { struct device_attribute attr; u64 id; const char *event_str; u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { struct device_attribute attr; u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, \ }; #define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_EVENT_ATTR_ID(_name, _show, _id) \ (&((struct perf_pmu_events_attr[]) { \ { .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, } \ })[0].attr.attr) #define PMU_FORMAT_ATTR(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS int perf_event_init_cpu(unsigned int cpu); int perf_event_exit_cpu(unsigned int cpu); #else #define perf_event_init_cpu NULL #define perf_event_exit_cpu NULL #endif extern void __weak arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); #ifdef CONFIG_MMU extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr); #endif #endif /* _LINUX_PERF_EVENT_H */
18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 // SPDX-License-Identifier: GPL-2.0 /* User-mappable watch queue * * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/watch_queue.rst */ #ifndef _LINUX_WATCH_QUEUE_H #define _LINUX_WATCH_QUEUE_H #include <uapi/linux/watch_queue.h> #include <linux/kref.h> #include <linux/rcupdate.h> #ifdef CONFIG_WATCH_QUEUE struct cred; struct watch_type_filter { enum watch_notification_type type; __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */ __u32 info_filter; /* Filter on watch_notification::info */ __u32 info_mask; /* Mask of relevant bits in info_filter */ }; struct watch_filter { union { struct rcu_head rcu; /* Bitmask of accepted types */ DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[]; }; struct watch_queue { struct rcu_head rcu; struct watch_filter __rcu *filter; struct pipe_inode_info *pipe; /* The pipe we're using as a buffer */ struct hlist_head watches; /* Contributory watches */ struct page **notes; /* Preallocated notifications */ unsigned long *notes_bitmap; /* Allocation bitmap for notes */ struct kref usage; /* Object usage count */ spinlock_t lock; unsigned int nr_notes; /* Number of notes */ unsigned int nr_pages; /* Number of pages in notes[] */ bool defunct; /* T when queues closed */ }; /* * Representation of a watch on an object. */ struct watch { union { struct rcu_head rcu; u32 info_id; /* ID to be OR'd in to info field */ }; struct watch_queue __rcu *queue; /* Queue to post events to */ struct hlist_node queue_node; /* Link in queue->watches */ struct watch_list __rcu *watch_list; struct hlist_node list_node; /* Link in watch_list->watchers */ const struct cred *cred; /* Creds of the owner of the watch */ void *private; /* Private data for the watched object */ u64 id; /* Internal identifier */ struct kref usage; /* Object usage count */ }; /* * List of watches on an object. */ struct watch_list { struct rcu_head rcu; struct hlist_head watchers; void (*release_watch)(struct watch *); spinlock_t lock; }; extern void __post_watch_notification(struct watch_list *, struct watch_notification *, const struct cred *, u64); extern struct watch_queue *get_watch_queue(int); extern void put_watch_queue(struct watch_queue *); extern void init_watch(struct watch *, struct watch_queue *); extern int add_watch_to_object(struct watch *, struct watch_list *); extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool); extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int); extern long watch_queue_set_filter(struct pipe_inode_info *, struct watch_notification_filter __user *); extern int watch_queue_init(struct pipe_inode_info *); extern void watch_queue_clear(struct watch_queue *); static inline void init_watch_list(struct watch_list *wlist, void (*release_watch)(struct watch *)) { INIT_HLIST_HEAD(&wlist->watchers); spin_lock_init(&wlist->lock); wlist->release_watch = release_watch; } static inline void post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, u64 id) { if (unlikely(wlist)) __post_watch_notification(wlist, n, cred, id); } static inline void remove_watch_list(struct watch_list *wlist, u64 id) { if (wlist) { remove_watch_from_object(wlist, NULL, id, true); kfree_rcu(wlist, rcu); } } /** * watch_sizeof - Calculate the information part of the size of a watch record, * given the structure size. */ #define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT) #else static inline int watch_queue_init(struct pipe_inode_info *pipe) { return -ENOPKG; } #endif #endif /* _LINUX_WATCH_QUEUE_H */
1619 954 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __LICENSE_H #define __LICENSE_H static inline int license_is_gpl_compatible(const char *license) { return (strcmp(license, "GPL") == 0 || strcmp(license, "GPL v2") == 0 || strcmp(license, "GPL and additional rights") == 0 || strcmp(license, "Dual BSD/GPL") == 0 || strcmp(license, "Dual MIT/GPL") == 0 || strcmp(license, "Dual MPL/GPL") == 0); } #endif
38 3 21 38 38 38 38 38 38 38 23 23 1 1 1 38 53 38 23 53 39 21 1 38 22 49 15 41 22 38 15 14 4 52 52 52 41 15 3 5 1 1 2 53 38 38 1 55 55 53 53 21 38 1 5 53 50 3 52 22 40 45 14 38 1 17 564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <asm/unaligned.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ static const char *const tcp_conntrack_names[] = { "NONE", "SYN_SENT", "SYN_RECV", "ESTABLISHED", "FIN_WAIT", "CLOSE_WAIT", "LAST_ACK", "TIME_WAIT", "CLOSE", "SYN_SENT2", }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = { [TCP_CONNTRACK_SYN_SENT] = 2 MINS, [TCP_CONNTRACK_SYN_RECV] = 60 SECS, [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, [TCP_CONNTRACK_LAST_ACK] = 30 SECS, [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE] = 10 SECS, [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ [TCP_CONNTRACK_RETRANS] = 5 MINS, [TCP_CONNTRACK_UNACK] = 5 MINS, }; #define sNO TCP_CONNTRACK_NONE #define sSS TCP_CONNTRACK_SYN_SENT #define sSR TCP_CONNTRACK_SYN_RECV #define sES TCP_CONNTRACK_ESTABLISHED #define sFW TCP_CONNTRACK_FIN_WAIT #define sCW TCP_CONNTRACK_CLOSE_WAIT #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE #define sS2 TCP_CONNTRACK_SYN_SENT2 #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE /* What TCP flags are set from RST/SYN/FIN/ACK. */ enum tcp_bit_set { TCP_SYN_SET, TCP_SYNACK_SET, TCP_FIN_SET, TCP_ACK_SET, TCP_RST_SET, TCP_NONE_SET, }; /* * The TCP state transition table needs a few words... * * We are the man in the middle. All the packets go through us * but might get lost in transit to the destination. * It is assumed that the destinations can't receive segments * we haven't seen. * * The checked segment is in window, but our windows are *not* * equivalent with the ones of the sender/receiver. We always * try to guess the state of the current sender. * * The meaning of the states are: * * NONE: initial state * SYN_SENT: SYN-only packet seen * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen * CLOSE_WAIT: ACK seen (after FIN) * LAST_ACK: FIN seen (after FIN) * TIME_WAIT: last ACK seen * CLOSE: closed connection (RST) * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid * and the receiver may send back a connection * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): * if we regard them as truly invalid packets */ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN * sS2 -> sS2 Late retransmitted SYN * sSR -> sIG * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. * Or we are not in sync and hold a dead connection. * sFW -> sIG * sCW -> sIG * sLA -> sIG * sTW -> sSS Reopened connection (RFC 1122). * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* * sNO -> sIV Too late and no reason to do anything * sSS -> sIV Client can't send SYN and then SYN/ACK * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open * sES -> sIV Invalid SYN/ACK packets sent by the client * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client migth not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for * the last ACK. * Migth be a retransmitted FIN as well... * sCW -> sLA * sLA -> sLA Retransmitted FIN. Remain in the same state. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. * sS2 -> sIV * sSR -> sES Established state is reached. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 }, /* * sNO -> sIV Never reached. * sSS -> sS2 Simultaneous open * sS2 -> sS2 Retransmitted simultaneous SYN * sSR -> sIV Invalid SYN packets sent by the server * sES -> sIV * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sSS Reopened connection, but server may have switched role * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. * sS2 -> sSR Simultaneous open * sSR -> sIG Retransmitted SYN/ACK, ignore it. * sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG Might be SYN/ACK answering ignored SYN * sCW -> sIG * sLA -> sIG * sTW -> sIG * sCL -> sIG */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. * sCW -> sLA * sLA -> sLA Retransmitted FIN. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, /* * sSS -> sIG Might be a half-open connection. * sS2 -> sIG * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) return; seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } #endif static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); else if (tcph->fin) return TCP_FIN_SET; else if (tcph->ack) return TCP_ACK_SET; else return TCP_NONE_SET; } /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering in IP Filter' by Guido van Rooij. http://www.sane.nl/events/sane2000/papers.html http://www.darkart.com/mirrors/www.obfuscation.org/ipf/ The boundaries and the conditions are changed according to RFC793: the packet must intersect the window (i.e. segments may be after the right or before the left edge) and thus receivers may ACK segments after the right edge of the window. td_maxend = max(sack + max(win,1)) seen in reply packets td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets td_maxwin += seq + len - sender.td_maxend if seq + len > sender.td_maxend td_end = max(seq + len) seen in sent packets I. Upper bound for valid data: seq <= sender.td_maxend II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin III. Upper bound for valid (s)ack: sack <= receiver.td_end IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW where sack is the highest right edge of sack block found in the packet or ack in the case of packet without SACK option. The upper bound limit for a valid (s)ack is not ignored - we doesn't have to deal with fragments. */ static inline __u32 segment_seq_plus_len(__u32 seq, size_t len, unsigned int dataoff, const struct tcphdr *tcph) { /* XXX Should I use payload length field in IP/IPv6 header ? * - YK */ return (seq + len - dataoff - tcph->doff*4 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); } /* Fixme: what about big packets? */ #define MAXACKWINCONST 66000 #define MAXACKWINDOW(sender) \ ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ : MAXACKWINCONST) /* * Simplified tcp_parse_options routine from tcp_input.c */ static void tcp_options(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, struct ip_ct_tcp_state *state) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); if (!length) return; ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); if (!ptr) return; state->td_scale = 0; state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; while (length > 0) { int opcode=*ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_SACK_PERM && opsize == TCPOLEN_SACK_PERM) state->flags |= IP_CT_TCP_FLAG_SACK_PERM; else if (opcode == TCPOPT_WINDOW && opsize == TCPOLEN_WINDOW) { state->td_scale = *(u_int8_t *)ptr; if (state->td_scale > TCP_MAX_WSCALE) state->td_scale = TCP_MAX_WSCALE; state->flags |= IP_CT_TCP_FLAG_WINDOW_SCALE; } ptr += opsize - 2; length -= opsize; } } } static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, __u32 *sack) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); __u32 tmp; if (!length) return; ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); if (!ptr) return; /* Fast path for timestamp-only option */ if (length == TCPOLEN_TSTAMP_ALIGNED && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) return; while (length > 0) { int opcode = *ptr++; int opsize, i; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_SACK && opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK)) { for (i = 0; i < (opsize - TCPOLEN_SACK_BASE); i += TCPOLEN_SACK_PERBLOCK) { tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); if (after(tmp, *sack)) *sack = tmp; } return; } ptr += opsize - 2; length -= opsize; } } } static bool tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, const struct nf_hook_state *hook_state) { struct ip_ct_tcp *state = &ct->proto.tcp; struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; u16 win_raw; s32 receiver_offset; bool res, in_recv_win; /* * Get the required data from the packet. */ seq = ntohl(tcph->seq); ack = sack = ntohl(tcph->ack_seq); win_raw = ntohs(tcph->window); win = win_raw; end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) tcp_sack(skb, dataoff, tcph, &sack); /* Take into account NAT sequence number mangling */ receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); ack -= receiver_offset; sack -= receiver_offset; pr_debug("tcp_in_window: START\n"); pr_debug("tcp_in_window: "); nf_ct_dump_tuple(tuple); pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", seq, ack, receiver_offset, sack, receiver_offset, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); if (sender->td_maxwin == 0) { /* * Initialize sender data. */ if (tcph->syn) { /* * SYN-ACK in reply to a SYN * or SYN from reply direction in simultaneous open. */ sender->td_end = sender->td_maxend = end; sender->td_maxwin = (win == 0 ? 1 : win); tcp_options(skb, dataoff, tcph, sender); /* * RFC 1323: * Both sides must send the Window Scale option * to enable window scaling in either direction. */ if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) sender->td_scale = receiver->td_scale = 0; if (!tcph->ack) /* Simultaneous open */ return true; } else { /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ sender->td_end = end; swin = win << sender->td_scale; sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; if (receiver->td_maxwin == 0) { /* We haven't seen traffic in the other * direction yet but we have to tweak window * tracking to pass III and IV until that * happens. */ receiver->td_end = receiver->td_maxend = sack; } else if (sack == receiver->td_end + 1) { /* Likely a reply to a keepalive. * Needed for III. */ receiver->td_end++; } } } else if (((state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL) || (state->state == TCP_CONNTRACK_SYN_RECV && dir == IP_CT_DIR_REPLY)) && after(end, sender->td_end)) { /* * RFC 793: "if a TCP is reinitialized ... then it need * not wait at all; it must only be sure to use sequence * numbers larger than those recently used." */ sender->td_end = sender->td_maxend = end; sender->td_maxwin = (win == 0 ? 1 : win); tcp_options(skb, dataoff, tcph, sender); } if (!(tcph->ack)) { /* * If there is no ACK, just pretend it was set and OK. */ ack = sack = receiver->td_end; } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == (TCP_FLAG_ACK|TCP_FLAG_RST)) && (ack == 0)) { /* * Broken TCP stacks, that set ACK in RST packets as well * with zero ack value. */ ack = sack = receiver->td_end; } if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) /* * RST sent answering SYN. */ seq = end = sender->td_end; pr_debug("tcp_in_window: "); nf_ct_dump_tuple(tuple); pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", seq, ack, receiver_offset, sack, receiver_offset, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); /* Is the ending sequence in the receive window (if available)? */ in_recv_win = !receiver->td_maxwin || after(end, sender->td_end - receiver->td_maxwin - 1); pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", before(seq, sender->td_maxend + 1), (in_recv_win ? 1 : 0), before(sack, receiver->td_end + 1), after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); if (before(seq, sender->td_maxend + 1) && in_recv_win && before(sack, receiver->td_end + 1) && after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { /* * Take into account window scaling (RFC 1323). */ if (!tcph->syn) win <<= sender->td_scale; /* * Update sender data. */ swin = win + (sack - ack); if (sender->td_maxwin < swin) sender->td_maxwin = swin; if (after(end, sender->td_end)) { sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; } if (tcph->ack) { if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { sender->td_maxack = ack; sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; } else if (after(ack, sender->td_maxack)) sender->td_maxack = ack; } /* * Update receiver data. */ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) receiver->td_maxwin += end - sender->td_maxend; if (after(sack + win, receiver->td_maxend - 1)) { receiver->td_maxend = sack + win; if (win == 0) receiver->td_maxend++; } if (ack == receiver->td_end) receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; /* * Check retransmissions. */ if (index == TCP_ACK_SET) { if (state->last_dir == dir && state->last_seq == seq && state->last_ack == ack && state->last_end == end && state->last_win == win_raw) state->retrans++; else { state->last_dir = dir; state->last_seq = seq; state->last_ack = ack; state->last_end = end; state->last_win = win_raw; state->retrans = 0; } } res = true; } else { res = false; if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal) res = true; if (!res) { bool seq_ok = before(seq, sender->td_maxend + 1); if (!seq_ok) { u32 overshot = end - sender->td_maxend + 1; bool ack_ok; ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1); if (in_recv_win && ack_ok && overshot <= receiver->td_maxwin && before(sack, receiver->td_end + 1)) { /* Work around TCPs that send more bytes than allowed by * the receive window. * * If the (marked as invalid) packet is allowed to pass by * the ruleset and the peer acks this data, then its possible * all future packets will trigger 'ACK is over upper bound' check. * * Thus if only the sequence check fails then do update td_end so * possible ACK for this data can update internal state. */ sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%u bytes more than expected", overshot); return res; } } nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", before(seq, sender->td_maxend + 1) ? in_recv_win ? before(sack, receiver->td_end + 1) ? after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" : "ACK is under the lower bound (possible overly delayed ACK)" : "ACK is over the upper bound (ACKed data not seen yet)" : "SEQ is under the lower bound (already ACKed data retransmitted)" : "SEQ is over the upper bound (over the window of the receiver)"); } } pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " "receiver end=%u maxend=%u maxwin=%u\n", res, sender->td_end, sender->td_maxend, sender->td_maxwin, receiver->td_end, receiver->td_maxend, receiver->td_maxwin); return res; } /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| TCPHDR_URG) + 1] = { [TCPHDR_SYN] = 1, [TCPHDR_SYN|TCPHDR_URG] = 1, [TCPHDR_SYN|TCPHDR_ACK] = 1, [TCPHDR_RST] = 1, [TCPHDR_RST|TCPHDR_ACK] = 1, [TCPHDR_FIN|TCPHDR_ACK] = 1, [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1, [TCPHDR_ACK] = 1, [TCPHDR_ACK|TCPHDR_URG] = 1, }; static void tcp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg); } /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ static bool tcp_error(const struct tcphdr *th, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int tcplen = skb->len - dataoff; u8 tcpflags; /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { tcp_error_log(skb, state, "truncated packet"); return true; } /* Checksum invalid? Ignore. * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING && nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) { tcp_error_log(skb, state, "bad checksum"); return true; } /* Check TCP flags. */ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { tcp_error_log(skb, state, "invalid tcp flag combination"); return true; } return false; } static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *th) { enum tcp_conntrack new_state; struct net *net = nf_ct_net(ct); const struct nf_tcp_net *tn = nf_tcp_pernet(net); const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; /* Don't need lock here: this conntrack not in circulation yet */ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { pr_debug("nf_ct_tcp: invalid new deleting.\n"); return false; } if (new_state == TCP_CONNTRACK_SYN_SENT) { memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* SYN packet */ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (ct->proto.tcp.seen[0].td_maxwin == 0) ct->proto.tcp.seen[0].td_maxwin = 1; ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end; tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); } else if (tn->tcp_loose == 0) { /* Don't try to pick up connections. */ return false; } else { memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (ct->proto.tcp.seen[0].td_maxwin == 0) ct->proto.tcp.seen[0].td_maxwin = 1; ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end + ct->proto.tcp.seen[0].td_maxwin; /* We assume SACK and liberal window checking to handle * window scaling */ ct->proto.tcp.seen[0].flags = ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | IP_CT_TCP_FLAG_BE_LIBERAL; } /* tcp_packet will set them */ ct->proto.tcp.last_index = TCP_NONE_SET; pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", __func__, sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); return true; } static bool tcp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.tcp.state) { case TCP_CONNTRACK_FIN_WAIT: case TCP_CONNTRACK_LAST_ACK: case TCP_CONNTRACK_TIME_WAIT: case TCP_CONNTRACK_CLOSE: case TCP_CONNTRACK_CLOSE_WAIT: return true; default: break; } return false; } static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) { state->td_end = 0; state->td_maxend = 0; state->td_maxwin = 0; state->td_maxack = 0; state->td_scale = 0; state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; unsigned long timeout; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) return -NF_ACCEPT; if (tcp_error(th, skb, dataoff, state)) return -NF_ACCEPT; if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th)) return -NF_ACCEPT; spin_lock_bh(&ct->lock); old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; tuple = &ct->tuplehash[dir].tuple; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: if (old_state < TCP_CONNTRACK_TIME_WAIT) break; /* RFC 1122: "When a connection is closed actively, * it MUST linger in TIME-WAIT state for a time 2xMSL * (Maximum Segment Lifetime). However, it MAY accept * a new SYN from the remote TCP to reopen the connection * directly from TIME-WAIT state, if..." * We ignore the conditions because we are in the * TIME-WAIT state anyway. * * Handle aborted connections: we and the server * think there is an existing connection but the client * aborts it and starts a new one. */ if (((ct->proto.tcp.seen[dir].flags | ct->proto.tcp.seen[!dir].flags) & IP_CT_TCP_FLAG_CLOSE_INIT) || (ct->proto.tcp.last_dir == dir && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ spin_unlock_bh(&ct->lock); /* Only repeat if we can actually remove the timer. * Destruction may already be in progress in process * context and we must give it a chance to terminate. */ if (nf_ct_kill(ct)) return -NF_REPEAT; return NF_DROP; } fallthrough; case TCP_CONNTRACK_IGNORE: /* Ignored packets: * * Our connection entry may be out of sync, so ignore * packets which may signal the real connection between * the client and the server. * * a) SYN in ORIGINAL * b) SYN/ACK in REPLY * c) ACK in reply direction after initial SYN in original. * * If the ignored packet is invalid, the receiver will send * a RST we'll catch below. */ if (index == TCP_SYNACK_SET && ct->proto.tcp.last_index == TCP_SYN_SET && ct->proto.tcp.last_dir != dir && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* b) This SYN/ACK acknowledges a SYN that we earlier * ignored as invalid. This means that the client and * the server are both in sync, while the firewall is * not. We get in sync from the previously annotated * values. */ old_state = TCP_CONNTRACK_SYN_SENT; new_state = TCP_CONNTRACK_SYN_RECV; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = ct->proto.tcp.last_end; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = ct->proto.tcp.last_end; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = ct->proto.tcp.last_win == 0 ? 1 : ct->proto.tcp.last_win; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = ct->proto.tcp.last_wscale; ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]); break; } ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; ct->proto.tcp.last_seq = ntohl(th->seq); ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.last_win = ntohs(th->window); /* a) This is a SYN in ORIGINAL. The client and the server * may be in sync but we are not. In that case, we annotate * the TCP options and let the packet go through. If it is a * valid SYN packet, the server will reply with a SYN/ACK, and * then we'll get in sync. Otherwise, the server potentially * responds with a challenge ACK if implementing RFC5961. */ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { struct ip_ct_tcp_state seen = {}; ct->proto.tcp.last_flags = ct->proto.tcp.last_wscale = 0; tcp_options(skb, dataoff, th, &seen); if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_WINDOW_SCALE; ct->proto.tcp.last_wscale = seen.td_scale; } if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_SACK_PERM; } /* Mark the potential for RFC5961 challenge ACK, * this pose a special problem for LAST_ACK state * as ACK is intrepretated as ACKing last FIN. */ if (old_state == TCP_CONNTRACK_LAST_ACK) ct->proto.tcp.last_flags |= IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "packet (index %d) in dir %d ignored, state %s", index, dir, tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or * the SYN/ACK from the server is lost, the client may transmit * a keep-alive packet while in SYN_SENT state. This needs to * be associated with the original conntrack entry in order to * generate a new SYN with the correct sequence number. */ if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); spin_unlock_bh(&ct->lock); return NF_ACCEPT; } /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "invalid state"); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance cause stack to send "challenge-ACK" * e.g. in response to spurious SYNs. Conntrack MUST * not believe this ACK is acking last FIN. */ if (old_state == TCP_CONNTRACK_LAST_ACK && index == TCP_ACK_SET && ct->proto.tcp.last_dir != dir && ct->proto.tcp.last_index == TCP_SYN_SET && (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) { /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; case TCP_CONNTRACK_SYN_SENT2: /* tcp_conntracks table is not smart enough to handle * simultaneous open. */ ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN; break; case TCP_CONNTRACK_SYN_RECV: if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET && ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN) new_state = TCP_CONNTRACK_ESTABLISHED; break; case TCP_CONNTRACK_CLOSE: if (index != TCP_RST_SET) break; /* If we are closing, tuple might have been re-used already. * last_index, last_ack, and all other ct fields used for * sequence/window validation are outdated in that case. * * As the conntrack can already be expired by GC under pressure, * just skip validation checks. */ if (tcp_can_early_drop(ct)) goto in_window; /* td_maxack might be outdated if we let a SYN through earlier */ if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) && ct->proto.tcp.last_index != TCP_SYN_SET) { u32 seq = ntohl(th->seq); /* If we are not in established state and SEQ=0 this is most * likely an answer to a SYN we let go through above (last_index * can be updated due to out-of-order ACKs). */ if (seq == 0 && !nf_conntrack_tcp_established(ct)) break; if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) && !tn->tcp_ignore_invalid_rst) { /* Invalid RST */ spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst"); return -NF_ACCEPT; } if (!nf_conntrack_tcp_established(ct) || seq == ct->proto.tcp.seen[!dir].td_maxack) break; /* Check if rst is part of train, such as * foo:80 > bar:4379: P, 235946583:235946602(19) ack 42 * foo:80 > bar:4379: R, 235946602:235946602(0) ack 42 */ if (ct->proto.tcp.last_index == TCP_ACK_SET && ct->proto.tcp.last_dir == dir && seq == ct->proto.tcp.last_end) break; /* ... RST sequence number doesn't match exactly, keep * established state to allow a possible challenge ACK. */ new_state = old_state; } if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_SYN_SET) || (!test_bit(IPS_ASSURED_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* RST sent to invalid SYN or ACK we had let through * at a) and c) above: * * a) SYN was in window then * c) we hold a half-open connection. * * Delete our connection entry. * We skip window checking, because packet might ACK * segments we ignored. */ goto in_window; } break; default: /* Keep compilers happy. */ break; } if (!tcp_in_window(ct, dir, index, skb, dataoff, th, state)) { spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } in_window: /* From now on we have got in-window packets */ ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; pr_debug("tcp_conntracks: "); nf_ct_dump_tuple(tuple); pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", (th->syn ? 1 : 0), (th->ack ? 1 : 0), (th->fin ? 1 : 0), (th->rst ? 1 : 0), old_state, new_state); ct->proto.tcp.state = new_state; if (old_state != new_state && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = tn->timeouts; if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; else if (unlikely(index == TCP_RST_SET)) timeout = timeouts[TCP_CONNTRACK_CLOSE]; else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; else if (ct->proto.tcp.last_win == 0 && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; else timeout = timeouts[new_state]; spin_unlock_bh(&ct->lock); if (new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common problem case, so we can delete the conntrack immediately. --RR */ if (th->rst) { nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) { /* do not renew timeout on SYN retransmit. * * Else port reuse by client or NAT middlebox can keep * entry alive indefinitely (including nat info). */ return NF_ACCEPT; } /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection * pickup with loose=1. Avoid large ESTABLISHED timeout. */ if (new_state == TCP_CONNTRACK_ESTABLISHED && timeout > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { /* Set ASSURED if we see valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. */ set_bit(IPS_ASSURED_BIT, &ct->status); nf_conntrack_event_cache(IPCT_ASSURED, ct); } nf_ct_refresh_acct(ct, ctinfo, skb, timeout); return NF_ACCEPT; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, ct->proto.tcp.seen[0].td_scale) || nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, ct->proto.tcp.seen[1].td_scale)) goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[0].flags; if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[1].flags; if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, }; #define TCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags))) static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1]; int err; /* updates could not contain anything about the private * protocol info, in that case skip the parsing */ if (!pattr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy, NULL); if (err < 0) return err; if (tb[CTA_PROTOINFO_TCP_STATE] && nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; spin_lock_bh(&ct->lock); if (tb[CTA_PROTOINFO_TCP_STATE]) ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { struct nf_ct_tcp_flags *attr = nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); ct->proto.tcp.seen[0].flags &= ~attr->mask; ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask; } if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { struct nf_ct_tcp_flags *attr = nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]); ct->proto.tcp.seen[1].flags &= ~attr->mask; ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask; } if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] && tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { ct->proto.tcp.seen[0].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); ct->proto.tcp.seen[1].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } spin_unlock_bh(&ct->lock); return 0; } static unsigned int tcp_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { struct nf_tcp_net *tn = nf_tcp_pernet(net); unsigned int *timeouts = data; int i; if (!timeouts) timeouts = tn->timeouts; /* set default TCP timeouts. */ for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) timeouts[i] = tn->timeouts[i]; if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) { timeouts[TCP_CONNTRACK_SYN_SENT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { timeouts[TCP_CONNTRACK_SYN_RECV] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; } if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) { timeouts[TCP_CONNTRACK_ESTABLISHED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ; } if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) { timeouts[TCP_CONNTRACK_FIN_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) { timeouts[TCP_CONNTRACK_CLOSE_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) { timeouts[TCP_CONNTRACK_LAST_ACK] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ; } if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) { timeouts[TCP_CONNTRACK_TIME_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_CLOSE]) { timeouts[TCP_CONNTRACK_CLOSE] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ; } if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) { timeouts[TCP_CONNTRACK_SYN_SENT2] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ; } if (tb[CTA_TIMEOUT_TCP_RETRANS]) { timeouts[TCP_CONNTRACK_RETRANS] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ; } if (tb[CTA_TIMEOUT_TCP_UNACK]) { timeouts[TCP_CONNTRACK_UNACK] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; } timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT]; return 0; } static int tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT, htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV, htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED, htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT, htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT, htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK, htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT, htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE, htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2, htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS, htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK, htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_tcp_init_net(struct net *net) { struct nf_tcp_net *tn = nf_tcp_pernet(net); int i; for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) tn->timeouts[i] = tcp_timeouts[i]; /* timeouts[0] is unused, make it same as SYN_SENT so * ->timeouts[0] contains 'new' timeout, like udp or icmp. */ tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; /* If it is set to zero, we disable picking up already established * connections. */ tn->tcp_loose = 1; /* "Be conservative in what you do, * be liberal in what you accept from others." * If it's non-zero, we mark only out of window RST segments as INVALID. */ tn->tcp_be_liberal = 0; /* If it's non-zero, we turn off RST sequence number check */ tn->tcp_ignore_invalid_rst = 0; /* Max number of the retransmitted packets without receiving an (acceptable) * ACK from the destination. If this number is reached, a shorter timer * will be started. */ tn->tcp_max_retrans = 3; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) tn->offload_timeout = 30 * HZ; #endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = { .l4proto = IPPROTO_TCP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = tcp_nlattr_tuple_size, .nlattr_size = TCP_NLATTR_SIZE, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = tcp_timeout_nlattr_to_obj, .obj_to_nlattr = tcp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_TCP_MAX, .obj_size = sizeof(unsigned int) * TCP_CONNTRACK_TIMEOUT_MAX, .nla_policy = tcp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
10 10 10 6 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <crypto/algapi.h> #include <crypto/sha2.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> #include <net/tcp.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/ip6_route.h> #include <net/transp_v6.h> #endif #include <net/mptcp.h> #include <uapi/linux/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> static void mptcp_subflow_ops_undo_override(struct sock *ssk); static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, enum linux_mptcp_mib_field field) { MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); } static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); pr_debug("subflow_req=%p", subflow_req); if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); mptcp_token_destroy_request(req); } static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, void *hmac) { u8 msg[8]; put_unaligned_be32(nonce1, &msg[0]); put_unaligned_be32(nonce2, &msg[4]); mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); } static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) { return mptcp_is_fully_established((void *)msk) && READ_ONCE(msk->pm.accept_subflow); } /* validate received token and create truncated hmac and nonce for SYN-ACK */ static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) { struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); subflow_generate_hmac(msk->local_key, msk->remote_key, subflow_req->local_nonce, subflow_req->remote_nonce, hmac); subflow_req->thmac = get_unaligned_be64(hmac); } static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_sock *msk; int local_id; msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return NULL; } local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); if (local_id < 0) { sock_put((struct sock *)msk); return NULL; } subflow_req->local_id = local_id; return msk; } static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) { struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (mpext) { memset(mpext, 0, sizeof(*mpext)); mpext->reset_reason = reason; } } /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset * should be sent. */ static int subflow_check_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) return -EINVAL; #endif mptcp_get_options(sk_listener, skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (opt_mp_join) return 0; } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } if (opt_mp_capable && listener->request_mptcp) { int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: do { get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); } while (subflow_req->local_key == 0); if (unlikely(req->syncookie)) { mptcp_crypto_key_sha(subflow_req->local_key, &subflow_req->token, &subflow_req->idsn); if (mptcp_token_exists(subflow_req->token)) { if (retries-- > 0) goto again; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else { subflow_req->mp_capable = 1; } return 0; } err = mptcp_token_new_request(req); if (err == 0) subflow_req->mp_capable = 1; else if (retries-- > 0) goto again; else SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else if (opt_mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; subflow_req->backup = mp_opt.backup; subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req); /* Can't fall back to TCP in this case. */ if (!subflow_req->msk) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EPERM; } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { pr_debug("syn inet_sport=%d %d", ntohs(inet_sk(sk_listener)->inet_sport), ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); return -EPERM; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); } subflow_req_create_thmac(subflow_req); if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); else return -EPERM; } pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; int err; subflow_init_req(req, sk_listener); mptcp_get_options(sk_listener, skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); if (opt_mp_capable && opt_mp_join) return -EINVAL; if (opt_mp_capable && listener->request_mptcp) { if (mp_opt.sndr_key == 0) return -EINVAL; subflow_req->local_key = mp_opt.rcvr_key; err = mptcp_token_new_request(req); if (err) return err; subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } else if (opt_mp_join && listener->request_mptcp) { if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) return -EINVAL; subflow_req->mp_join = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } return 0; } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp_request_sock_ops.send_reset(sk, skb); return NULL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp6_request_sock_ops.send_reset(sk, skb); return NULL; } #endif /* validate received truncated hmac and create hmac for third ACK */ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) { u8 hmac[SHA256_DIGEST_SIZE]; u64 thmac; subflow_generate_hmac(subflow->remote_key, subflow->local_key, subflow->remote_nonce, subflow->local_nonce, hmac); thmac = get_unaligned_be64(hmac); pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n", subflow, subflow->token, (unsigned long long)thmac, (unsigned long long)subflow->thmac); return thmac == subflow->thmac; } void mptcp_subflow_reset(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && schedule_work(&mptcp_sk(sk)->work)) return; /* worker will put sk for us */ sock_put(sk); } static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } void __mptcp_set_connected(struct sock *sk) { if (sk->sk_state == TCP_SYN_SENT) { inet_sk_state_store(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); } } static void mptcp_set_connected(struct sock *sk) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_set_connected(sk); else set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags); mptcp_data_unlock(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) return; mptcp_propagate_sndbuf(parent, sk); subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); mptcp_get_options(sk, skb, &mp_opt); if (subflow->request_mptcp) { if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); mptcp_do_fallback(sk); pr_fallback(mptcp_sk(subflow->conn)); goto fallback; } if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); if (mp_opt.deny_join_id0) WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; pr_debug("subflow=%p, remote_key=%llu", subflow, subflow->remote_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); mptcp_set_connected(parent); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) { subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } if (!mptcp_finish_join(sk)) goto do_reset; subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, hmac); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); if (subflow_use_different_dport(mptcp_sk(parent), sk)) { pr_debug("synack inet_dport=%d %d", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); } } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(mptcp_sk(parent), sk); mptcp_set_connected(parent); } return; do_reset: subflow->reset_transient = 0; mptcp_subflow_reset(sk); } static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p", subflow); /* Never answer to SYNs sent to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: tcp_listendrop(sk); return 0; } static void subflow_v4_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp_request_sock_ops.destructor(req); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; static struct proto tcpv6_prot_override __ro_after_init; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p", subflow); if (skb->protocol == htons(ETH_P_IP)) return subflow_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void subflow_v6_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp6_request_sock_ops.destructor(req); } #endif struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { if (ops->family == AF_INET) ops = &mptcp_subflow_v4_request_sock_ops; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ops->family == AF_INET6) ops = &mptcp_subflow_v6_request_sock_ops; #endif return inet_reqsk_alloc(ops, sk_listener, attach_listener); } EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct request_sock *req, const struct mptcp_options_received *mp_opt) { const struct mptcp_subflow_request_sock *subflow_req; u8 hmac[SHA256_DIGEST_SIZE]; struct mptcp_sock *msk; subflow_req = mptcp_subflow_rsk(req); msk = subflow_req->msk; if (!msk) return false; subflow_generate_hmac(msk->remote_key, msk->local_key, subflow_req->remote_nonce, subflow_req->local_nonce, hmac); return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } static void mptcp_sock_destruct(struct sock *sk) { /* if new mptcp socket isn't accepted, it is free'd * from the tcp listener sockets request queue, linked * from req->sk. The tcp socket is released. * This calls the ULP release function which will * also remove the mptcp socket, via * sock_put(ctx->conn). * * Problem is that the mptcp socket will be in * ESTABLISHED state and will not have the SOCK_DEAD flag. * Both result in warnings from inet_sock_destruct. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { sk->sk_state = TCP_CLOSE; WARN_ON_ONCE(sk->sk_socket); sock_orphan(sk); } mptcp_destroy_common(mptcp_sk(sk)); inet_sock_destruct(sk); } static void mptcp_force_close(struct sock *sk) { /* the msk is not yet exposed to user-space */ inet_sk_state_store(sk, TCP_CLOSE); sk_common_release(sk); } static void subflow_ulp_fallback(struct sock *sk, struct mptcp_subflow_context *old_ctx) { struct inet_connection_sock *icsk = inet_csk(sk); mptcp_subflow_tcp_fallback(sk, old_ctx); icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; mptcp_subflow_ops_undo_override(sk); } static void subflow_drop_ctx(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); if (!ctx) return; subflow_ulp_fallback(ssk, ctx); if (ctx->conn) sock_put(ctx->conn); kfree_rcu(ctx, rcu); } void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = mptcp_sk(subflow->conn); subflow->remote_key = mp_opt->sndr_key; subflow->fully_established = 1; subflow->can_ack = 1; WRITE_ONCE(msk->fully_established, true); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; struct sock *new_msk = NULL; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); /* After child creation we must look for MPC even when options * are not parsed */ mp_opt.suboptions = 0; /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; fallback = !tcp_rsk(req)->is_mptcp; if (fallback) goto create_child; /* if the sk is MP_CAPABLE, we try to fetch the client key */ if (subflow_req->mp_capable) { /* we can receive and accept an in-window, out-of-order pkt, * which may not carry the MP_CAPABLE opt even on mptcp enabled * paths: always try to extract the peer key, and fallback * for packets missing it. * Even OoO DSS packets coming legitly after dropped or * reordered MPC will cause fallback, but we don't have other * options. */ mptcp_get_options(sk, skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { fallback = true; goto create_child; } new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req); if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(sk, skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; } } create_child: child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); tcp_rsk(req)->drop_req = false; /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. In the latter scenario we additionally need * to reset the context to non MPTCP status. */ if (!ctx || fallback) { if (fallback_is_fatal) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } subflow_drop_ctx(child); goto out; } /* ssk inherits options of listener sk */ ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space */ inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); /* record the newly created socket as the first msk * subflow, but don't link it yet into conn_list */ WRITE_ONCE(mptcp_sk(new_msk)->first, child); /* new mpc subflow takes ownership of the newly * created mptcp socket */ new_msk->sk_destruct = mptcp_sock_destruct; mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq; mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; /* with OoO packets we can reach here without ingress * mpc option */ if (mp_opt.suboptions & OPTIONS_MPTCP_MPC) mptcp_subflow_fully_established(ctx, &mp_opt); } else if (ctx->mp_join) { struct mptcp_sock *owner; owner = subflow_req->msk; if (!owner) { subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; if (subflow_use_different_sport(owner, sk)) { pr_debug("ack inet_sport=%d %d", ntohs(inet_sk(sk)->inet_sport), ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); } if (!mptcp_finish_join(child)) goto dispose_child; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; } } out: /* dispose of the left over mptcp master, if any */ if (unlikely(new_msk)) mptcp_force_close(new_msk); /* check for expected invariant - should never trigger, just help * catching eariler subtle bugs */ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && (!mptcp_subflow_ctx(child) || !mptcp_subflow_ctx(child)->conn)); return child; dispose_child: subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); req->rsk_ops->send_reset(sk, skb); /* The last child reference will be released by the caller */ return child; } static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; static struct proto tcp_prot_override __ro_after_init; enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, MAPPING_DUMMY }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", ssn, subflow->map_subflow_seq, subflow->map_data_len); } static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned int skb_consumed; skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; if (WARN_ON_ONCE(skb_consumed >= skb->len)) return true; return skb->len - skb_consumed <= subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); } static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; if (unlikely(before(ssn, subflow->map_subflow_seq))) { /* Mapping covers data later in the subflow stream, * currently unsupported. */ dbg_bad_map(subflow, ssn); return false; } if (unlikely(!before(ssn, subflow->map_subflow_seq + subflow->map_data_len))) { /* Mapping does covers past subflow data, invalid */ dbg_bad_map(subflow, ssn); return false; } return true; } static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, bool csum_reqd) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 offset, seq, delta; __sum16 csum; int len; if (!csum_reqd) return MAPPING_OK; /* mapping already validated on previous traversal */ if (subflow->map_csum_len == subflow->map_data_len) return MAPPING_OK; /* traverse the receive queue, ensuring it contains a full * DSS mapping and accumulating the related csum. * Preserve the accoumlate csum across multiple calls, to compute * the csum only once */ delta = subflow->map_data_len - subflow->map_csum_len; for (;;) { seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; offset = seq - TCP_SKB_CB(skb)->seq; /* if the current skb has not been accounted yet, csum its contents * up to the amount covered by the current DSS */ if (offset < skb->len) { __wsum csum; len = min(skb->len - offset, delta); csum = skb_checksum(skb, offset, len, 0); subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, subflow->map_csum_len); delta -= len; subflow->map_csum_len += len; } if (delta == 0) break; if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { /* if this subflow is closed, the partial mapping * will be never completed; flush the pending skbs, so * that subflow_sched_work_if_closed() can kick in */ if (unlikely(ssk->sk_state == TCP_CLOSE)) while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); /* not enough data to validate the csum */ return MAPPING_EMPTY; } /* the DSS mapping for next skbs will be validated later, * when a get_mapping_status call will process such skb */ skb = skb->next; } /* note that 'map_data_len' accounts only for the carried data, does * not include the eventual seq increment due to the data fin, * while the pseudo header requires the original DSS data len, * including that */ csum = __mptcp_make_csum(subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len + subflow->map_data_fin, subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); if (subflow->mp_join || subflow->valid_csum_seen) { subflow->send_mp_fail = 1; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); } return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; } subflow->valid_csum_seen = 1; return MAPPING_OK; } static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; u64 map_seq; skb = skb_peek(&ssk->sk_receive_queue); if (!skb) return MAPPING_EMPTY; if (mptcp_check_fallback(ssk)) return MAPPING_DUMMY; mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { /* the TCP stack deliver 0 len FIN pkt to the receive * queue, that is the only 0len pkts ever expected here, * and we can admit no mapping only for 0 len pkts */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) WARN_ONCE(1, "0len seq %d:%d flags %x", TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, TCP_SKB_CB(skb)->tcp_flags); sk_eat_skb(ssk, skb); return MAPPING_EMPTY; } if (!subflow->map_valid) return MAPPING_INVALID; goto validate_seq; } trace_get_mapping_status(mpext); data_len = mpext->data_len; if (data_len == 0) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); return MAPPING_INVALID; } if (mpext->data_fin == 1) { if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, mpext->dsn64); pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping * has been fully consumed. Continue * handling the existing mapping. */ skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } else { if (updated && schedule_work(&msk->work)) sock_hold((struct sock *)msk); return MAPPING_DATA_FIN; } } else { u64 data_fin_seq = mpext->data_seq + data_len - 1; /* If mpext->data_seq is a 32-bit value, data_fin_seq * must also be limited to 32 bits. */ if (!mpext->dsn64) data_fin_seq &= GENMASK_ULL(31, 0); mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d", data_fin_seq, mpext->dsn64); } /* Adjust for DATA_FIN using 1 byte of sequence space */ data_len--; } map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && subflow->map_data_len == data_len && subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); goto validate_csum; } /* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported */ if (skb_is_fully_mapped(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID; } /* will validate the next map after consuming the current one */ goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; subflow->map_csum_reqd = mpext->csum_reqd; subflow->map_csum_len = 0; subflow->map_data_csum = csum_unfold(mpext->csum); /* Cfr RFC 8684 Section 3.3.0 */ if (unlikely(subflow->map_csum_reqd != csum_reqd)) return MAPPING_INVALID; pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len, subflow->map_csum_reqd, subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure * the current skb is completely covered by the available mapping */ if (!validate_mapping(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID; } skb_ext_del(skb, SKB_EXT_MPTCP); validate_csum: return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, u64 limit) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; u32 incr; incr = limit >= skb->len ? skb->len + fin : limit; pr_debug("discarding=%d len=%d seq=%d", incr, skb->len, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; } /* sched mptcp worker to remove the subflow if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; if (likely(ssk->sk_state != TCP_CLOSE)) return; if (skb_queue_empty(&ssk->sk_receive_queue) && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) { sock_hold(sk); if (!schedule_work(&msk->work)) sock_put(sk); } } static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) { struct mptcp_sock *msk = mptcp_sk(subflow->conn); if (subflow->mp_join) return false; else if (READ_ONCE(msk->csum_enabled)) return !subflow->valid_csum_seen; else return !subflow->fully_established; } static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); enum mapping_status status; struct mptcp_sock *msk; struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) WRITE_ONCE(subflow->data_avail, 0); if (subflow->data_avail) return true; msk = mptcp_sk(subflow->conn); for (;;) { u64 ack_seq; u64 old_ack; status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (unlikely(status == MAPPING_INVALID)) goto fallback; if (unlikely(status == MAPPING_DUMMY)) goto fallback; if (status != MAPPING_OK) goto no_data; skb = skb_peek(&ssk->sk_receive_queue); if (WARN_ON_ONCE(!skb)) goto no_data; /* if msk lacks the remote key, this subflow must provide an * MP_CAPABLE-based mapping */ if (unlikely(!READ_ONCE(msk->can_ack))) { if (!subflow->mpc_map) goto fallback; WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->ack_seq, subflow->map_seq); WRITE_ONCE(msk->can_ack, true); } old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, ack_seq); if (unlikely(before64(ack_seq, old_ack))) { mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); continue; } WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); break; } return true; no_data: subflow_sched_work_if_closed(msk, ssk); return false; fallback: /* RFC 8684 section 3.7. */ if (subflow->send_mp_fail) { if (mptcp_has_another_subflow(ssk)) { while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); } ssk->sk_err = EBADMSG; tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; tcp_send_active_reset(ssk, GFP_ATOMIC); WRITE_ONCE(subflow->data_avail, 0); return true; } if (!subflow_can_fallback(subflow)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ ssk->sk_err = EBADMSG; tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); WRITE_ONCE(subflow->data_avail, 0); return false; } __mptcp_do_fallback(msk); skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); return true; } bool mptcp_subflow_data_available(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* check if current mapping is still valid */ if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; WRITE_ONCE(subflow->data_avail, 0); pr_debug("Done with mapping: seq=%u data_len=%u", subflow->map_subflow_seq, subflow->map_data_len); } return subflow_check_data_avail(sk); } /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, * not the ssk one. * * In mptcp, rwin is about the mptcp-level connection data. * * Data that is still on the ssk rx queue can thus be ignored, * as far as mptcp peer is concerned that data is still inflight. * DSS ACK is updated when skb is moved to the mptcp rx queue. */ void mptcp_space(const struct sock *ssk, int *space, int *full_space) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); *full_space = tcp_full_space(sk); } void __mptcp_error_report(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int err = sock_error(ssk); int ssk_state; if (!err) continue; /* only propagate errors on fallen-back sockets or * on MPC connect */ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) continue; /* We need to propagate only transition to CLOSE state. * Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly * destroy the msk as needed. */ ssk_state = inet_sk_state_load(ssk); if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) inet_sk_state_store(sk, ssk_state); sk->sk_err = -err; /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); sk_error_report(sk); break; } } static void subflow_error_report(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; /* bail early if this is a no-op, so that we avoid introducing a * problematic lockdep dependency between TCP accept queue lock * and msk socket spinlock */ if (!sk->sk_socket) return; mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags); mptcp_data_unlock(sk); } static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); u16 state = 1 << inet_sk_state_load(sk); struct sock *parent = subflow->conn; struct mptcp_sock *msk; msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { /* MPJ subflow are removed from accept queue before reaching here, * avoid stray wakeups */ if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) return; set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; } WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && !subflow->mp_join && !(state & TCPF_CLOSE)); if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); else if (unlikely(sk->sk_err)) subflow_error_report(sk); } static void subflow_write_space(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; mptcp_propagate_sndbuf(sk, ssk); mptcp_write_space(sk); } static const struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (sk->sk_family == AF_INET6) return &subflow_v6_specific; #endif return &subflow_specific; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock_af_ops *target; target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) return; subflow->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = target; } #endif void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family) { memset(addr, 0, sizeof(*addr)); addr->ss_family = family; if (addr->ss_family == AF_INET) { struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; if (info->family == AF_INET) in_addr->sin_addr = info->addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ipv6_addr_v4mapped(&info->addr6)) in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; #endif in_addr->sin_port = info->port; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; if (info->family == AF_INET) ipv6_addr_set_v4mapped(info->addr.s_addr, &in6_addr->sin6_addr); else in6_addr->sin6_addr = info->addr6; in6_addr->sin6_port = info->port; } #endif } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct sockaddr_storage addr; int remote_id = remote->id; int local_id = loc->id; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; int ifindex; u8 flags; int err; if (!mptcp_is_fully_established(sk)) return -ENOTCONN; err = mptcp_subflow_create_socket(sk, &sf); if (err) return err; ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); do { get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); if (!local_id) { err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk); if (err < 0) goto failed; local_id = err; } mptcp_pm_get_flags_and_ifindex_by_id(sock_net(sk), local_id, &flags, &ifindex); subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; mptcp_info2sockaddr(loc, &addr, ssk->sk_family); addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) goto failed; mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; subflow->local_id = local_id; subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); mptcp_info2sockaddr(remote, &addr, ssk->sk_family); mptcp_add_pending_subflow(msk, subflow); mptcp_sockopt_sync(msk, ssk); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) goto failed_unlink; /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); return err; failed_unlink: spin_lock_bh(&msk->join_list_lock); list_del(&subflow->node); spin_unlock_bh(&msk->join_list_lock); sock_put(mptcp_subflow_tcp_sock(subflow)); failed: subflow->disposable = 1; sock_release(sf); return err; } static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) { #ifdef CONFIG_SOCK_CGROUP_DATA struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, *child_skcd = &child->sk_cgrp_data; /* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != cgroup_id(sock_cgroup_ptr(child_skcd))) { #ifdef CONFIG_MEMCG struct mem_cgroup *memcg = parent->sk_memcg; mem_cgroup_sk_free(child); if (memcg && css_tryget(&memcg->css)) child->sk_memcg = memcg; #endif /* CONFIG_MEMCG */ cgroup_sk_free(child_skcd); *child_skcd = *parent_skcd; cgroup_sk_clone(child_skcd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ } static void mptcp_subflow_ops_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot) ssk->sk_prot = &tcpv6_prot_override; else #endif ssk->sk_prot = &tcp_prot_override; } static void mptcp_subflow_ops_undo_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot_override) ssk->sk_prot = &tcpv6_prot; else #endif ssk->sk_prot = &tcp_prot; } int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; struct net *net = sock_net(sk); struct socket *sf; int err; /* un-accepted server sockets can reach here - on bad configuration * bail early to avoid greater trouble later */ if (unlikely(!sk->sk_socket)) return -EINVAL; err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) return err; lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. */ sf->sk->sk_net_refcnt = 1; get_net(net); #ifdef CONFIG_PROC_FS this_cpu_add(*net->core.sock_inuse, 1); #endif err = tcp_set_ulp(sf->sk, "mptcp"); release_sock(sf->sk); if (err) { sock_release(sf); return err; } /* the newly created socket really belongs to the owning MPTCP master * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interaces really show this one belonging to the correct * user. */ SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid; SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; subflow = mptcp_subflow_ctx(sf->sk); pr_debug("subflow=%p", subflow); *new_sock = sf; sock_hold(sk); subflow->conn = sk; mptcp_subflow_ops_override(sf->sk); return 0; } static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; ctx = kzalloc(sizeof(*ctx), priority); if (!ctx) return NULL; rcu_assign_pointer(icsk->icsk_ulp_data, ctx); INIT_LIST_HEAD(&ctx->node); INIT_LIST_HEAD(&ctx->delegated_node); pr_debug("subflow=%p", ctx); ctx->tcp_sock = sk; return ctx; } static void __subflow_state_change(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static bool subflow_is_done(const struct sock *sk) { return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; } static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; __subflow_state_change(sk); if (subflow_simultaneous_connect(sk)) { mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); mptcp_rcv_space_init(mptcp_sk(parent), sk); pr_fallback(mptcp_sk(parent)); subflow->conn_finished = 1; mptcp_set_connected(parent); } /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. */ if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); else if (unlikely(sk->sk_err)) subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); if (__mptcp_check_fallback(mptcp_sk(parent)) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; mptcp_subflow_eof(parent); } } static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; struct tcp_sock *tp = tcp_sk(sk); int err = 0; /* disallow attaching ULP to a socket unless it has been * created with sock_create_kern() */ if (!sk->sk_kern_sock) { err = -EOPNOTSUPP; goto out; } ctx = subflow_create_ctx(sk, GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto out; } pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = subflow_default_af_ops(sk); ctx->tcp_data_ready = sk->sk_data_ready; ctx->tcp_state_change = sk->sk_state_change; ctx->tcp_write_space = sk->sk_write_space; ctx->tcp_error_report = sk->sk_error_report; sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; sk->sk_error_report = subflow_error_report; out: return err; } static void subflow_ulp_release(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); bool release = true; struct sock *sk; if (!ctx) return; sk = ctx->conn; if (sk) { /* if the msk has been orphaned, keep the ctx * alive, will be freed by __mptcp_close_ssk(), * when the subflow is still unaccepted */ release = ctx->disposable || list_empty(&ctx->node); sock_put(sk); } mptcp_subflow_ops_undo_override(ssk); if (release) kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *new_ctx; if (!tcp_rsk(req)->is_mptcp || (!subflow_req->mp_capable && !subflow_req->mp_join)) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx = subflow_create_ctx(newsk, priority); if (!new_ctx) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_write_space = old_ctx->tcp_write_space; new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk; if (subflow_req->mp_capable) { /* see comments in subflow_syn_recv_sock(), MPTCP connection * is fully established only after we receive the remote key */ new_ctx->mp_capable = 1; new_ctx->local_key = subflow_req->local_key; new_ctx->token = subflow_req->token; new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->idsn = subflow_req->idsn; } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; new_ctx->fully_established = 1; new_ctx->backup = subflow_req->backup; new_ctx->local_id = subflow_req->local_id; new_ctx->remote_id = subflow_req->remote_id; new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; } } static void tcp_release_cb_override(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); if (mptcp_subflow_has_delegated_action(subflow)) mptcp_subflow_process_delegated(ssk); tcp_release_cb(ssk); } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, .init = subflow_ulp_init, .release = subflow_ulp_release, .clone = subflow_ulp_clone, }; static int subflow_ops_init(struct request_sock_ops *subflow_ops) { subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, subflow_ops->obj_size, 0, SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, NULL); if (!subflow_ops->slab) return -ENOMEM; return 0; } void __init mptcp_subflow_init(void) { mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v4 request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock * structures for v4 and v6 have the same size. It should not changed in * the future but better to make sure to be warned if it is no longer * the case. */ BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v6 request sock ops\n"); subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; subflow_v6m_specific.send_check = ipv4_specific.send_check; subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.net_frag_header_len = 0; tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; #endif mptcp_diag_subflow_init(&subflow_ulp_ops); if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); }
727 728 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Credential hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #include <linux/cred.h> #include <linux/lsm_hooks.h> #include "common.h" #include "cred.h" #include "ruleset.h" #include "setup.h" static int hook_cred_prepare(struct cred *const new, const struct cred *const old, const gfp_t gfp) { struct landlock_ruleset *const old_dom = landlock_cred(old)->domain; if (old_dom) { landlock_get_ruleset(old_dom); landlock_cred(new)->domain = old_dom; } return 0; } static void hook_cred_free(struct cred *const cred) { struct landlock_ruleset *const dom = landlock_cred(cred)->domain; if (dom) landlock_put_ruleset_deferred(dom); } static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(cred_prepare, hook_cred_prepare), LSM_HOOK_INIT(cred_free, hook_cred_free), }; __init void landlock_add_cred_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), LANDLOCK_NAME); }
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> * * Based on the original implementation which is: * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * * Parts of the original code have been moved to arch/x86/vdso/vma.c * * This file implements vsyscall emulation. vsyscalls are a legacy ABI: * Userspace can request certain kernel services by calling fixed * addresses. This concept is problematic: * * - It interferes with ASLR. * - It's awkward to write code that lives in kernel addresses but is * callable by userspace at fixed addresses. * - The whole concept is impossible for 32-bit compat userspace. * - UML cannot easily virtualize a vsyscall. * * As of mid-2014, I believe that there is no new userspace code that * will use a vsyscall if the vDSO is present. I hope that there will * soon be no new userspace code that will ever use a vsyscall. * * The code in this file emulates vsyscalls when notified of a page * fault to a vsyscall address. */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/mm_types.h> #include <linux/syscalls.h> #include <linux/ratelimit.h> #include <asm/vsyscall.h> #include <asm/unistd.h> #include <asm/fixmap.h> #include <asm/traps.h> #include <asm/paravirt.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) XONLY; #else EMULATE; #endif static int __init vsyscall_setup(char *str) { if (str) { if (!strcmp("emulate", str)) vsyscall_mode = EMULATE; else if (!strcmp("xonly", str)) vsyscall_mode = XONLY; else if (!strcmp("none", str)) vsyscall_mode = NONE; else return -EINVAL; return 0; } return -EINVAL; } early_param("vsyscall", vsyscall_setup); static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { if (!show_unhandled_signals) return; printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) { int nr; if ((addr & ~0xC00UL) != VSYSCALL_ADDR) return -EINVAL; nr = (addr & 0xC00UL) >> 10; if (nr >= 3) return -EINVAL; return nr; } static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* * XXX: if access_ok, get_user, and put_user handled * sig_on_uaccess_err, this could go away. */ if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = &current->thread; thread->error_code = X86_PF_USER | X86_PF_WRITE; thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr); return false; } else { return true; } } bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; int vsyscall_nr, syscall_nr, tmp; int prev_sig_on_uaccess_err; long ret; unsigned long orig_dx; /* Write faults or kernel-privilege faults never get fixed up. */ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) return false; if (!(error_code & X86_PF_INSTR)) { /* Failed vsyscall read */ if (vsyscall_mode == EMULATE) return false; /* * User code tried and failed to read the vsyscall page. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); return false; } /* * No point in checking CS -- the only way to get here is a user mode * trap to a high address, which means that we're in 64-bit user code. */ WARN_ON_ONCE(address != regs->ip); if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); return false; } vsyscall_nr = addr_to_vsyscall_nr(address); trace_emulate_vsyscall(vsyscall_nr); if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); goto sigsegv; } if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { warn_bad_vsyscall(KERN_WARNING, regs, "vsyscall with bad stack (exploit attempt?)"); goto sigsegv; } tsk = current; /* * Check for access_ok violations and find the syscall nr. * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ switch (vsyscall_nr) { case 0: if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) || !write_ok_or_segv(regs->si, sizeof(struct timezone))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_gettimeofday; break; case 1: if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_time; break; case 2: if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || !write_ok_or_segv(regs->si, sizeof(unsigned))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_getcpu; break; } /* * Handle seccomp. regs->ip must be the original value. * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst. * * We could optimize the seccomp disabled case, but performance * here doesn't matter. */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; tmp = secure_computing(); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); force_exit_sig(SIGSYS); return true; } regs->orig_ax = -1; if (tmp) goto do_ret; /* skip requested */ /* * With a real vsyscall, page faults cause SIGSEGV. We want to * preserve that behavior to make writing exploits harder. */ prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; current->thread.sig_on_uaccess_err = 1; ret = -EFAULT; switch (vsyscall_nr) { case 0: /* this decodes regs->di and regs->si on its own */ ret = __x64_sys_gettimeofday(regs); break; case 1: /* this decodes regs->di on its own */ ret = __x64_sys_time(regs); break; case 2: /* while we could clobber regs->dx, we didn't in the past... */ orig_dx = regs->dx; regs->dx = 0; /* this decodes regs->di, regs->si and regs->dx on its own */ ret = __x64_sys_getcpu(regs); regs->dx = orig_dx; break; } current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); /* * If we failed to generate a signal for any reason, * generate one here. (This should be impossible.) */ if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && !sigismember(&tsk->pending.signal, SIGSEGV))) goto sigsegv; return true; /* Don't emulate the ret. */ } regs->ax = ret; do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; return true; sigsegv: force_sig(SIGSEGV); return true; } /* * A pseudo VMA to allow ptrace access for the vsyscall page. This only * covers the 64bit vsyscall page now. 32bit has a real VMA now and does * not need special handling anymore: */ static const char *gate_vma_name(struct vm_area_struct *vma) { return "[vsyscall]"; } static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; static struct vm_area_struct gate_vma __ro_after_init = { .vm_start = VSYSCALL_ADDR, .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, .vm_flags = VM_READ | VM_EXEC, .vm_ops = &gate_vma_ops, }; struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_COMPAT if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL)) return NULL; #endif if (vsyscall_mode == NONE) return NULL; return &gate_vma; } int in_gate_area(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; return (addr >= vma->vm_start) && (addr < vma->vm_end); } /* * Use this when you have no reliable mm, typically from interrupt * context. It is less reliable than using a task's mm and may give * false positives. */ int in_gate_area_no_mm(unsigned long addr) { return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } /* * The VSYSCALL page is the only user-accessible page in the kernel address * range. Normally, the kernel page tables can have _PAGE_USER clear, but * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls * are enabled. * * Some day we may create a "minimal" vsyscall mode in which we emulate * vsyscalls but leave the page not present. If so, we skip calling * this. */ void __init set_vsyscall_pgtable_user_bits(pgd_t *root) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); #if CONFIG_PGTABLE_LEVELS >= 5 set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); #endif pud = pud_offset(p4d, VSYSCALL_ADDR); set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); pmd = pmd_offset(pud, VSYSCALL_ADDR); set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); } void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); /* * For full emulation, the page needs to exist for real. In * execute-only mode, there is no PTE at all backing the vsyscall * page. */ if (vsyscall_mode == EMULATE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, PAGE_KERNEL_VVAR); set_vsyscall_pgtable_user_bits(swapper_pg_dir); } if (vsyscall_mode == XONLY) gate_vma.vm_flags = VM_EXEC; BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); }
37 37 1 19 9 1 11 1 36 1 18 36 9 6 37 19 35 19 16 41 5 1 36 9 29 1 39 6 6 3 9 92 53 1 13 28 24 24 22 2 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2007 Alan Stern * Copyright (C) IBM Corporation, 2009 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> * * Thanks to Ingo Molnar for his many suggestions. * * Authors: Alan Stern <stern@rowland.harvard.edu> * K.Prasad <prasad@linux.vnet.ibm.com> * Frederic Weisbecker <fweisbec@gmail.com> */ /* * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, * using the CPU's debug registers. * This file contains the arch-independent routines. */ #include <linux/irqflags.h> #include <linux/kallsyms.h> #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/bug.h> #include <linux/hw_breakpoint.h> /* * Constraints data */ struct bp_cpuinfo { /* Number of pinned cpu breakpoints in a cpu */ unsigned int cpu_pinned; /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ unsigned int *tsk_pinned; /* Number of non-pinned cpu/task breakpoints in a cpu */ unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); static int nr_slots[TYPE_MAX]; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { return per_cpu_ptr(bp_cpuinfo + type, cpu); } /* Keep track of the breakpoints attached to tasks */ static LIST_HEAD(bp_task_head); static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { unsigned int pinned; unsigned int flexible; }; /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); __weak int hw_breakpoint_weight(struct perf_event *bp) { return 1; } static inline enum bp_type_idx find_slot_idx(u64 bp_type) { if (bp_type & HW_BREAKPOINT_RW) return TYPE_DATA; return TYPE_INST; } /* * Report the maximum number of pinned breakpoints a task * have in this cpu */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int i; for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } return 0; } /* * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { struct task_struct *tsk = bp->hw.target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { if (iter->hw.target == tsk && find_slot_idx(iter->attr.bp_type) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } return count; } static const struct cpumask *cpumask_of_bp(struct perf_event *bp) { if (bp->cpu >= 0) return cpumask_of(bp->cpu); return cpu_possible_mask; } /* * Report the number of pinned/un-pinned breakpoints we have in * a given cpu (cpu > -1) or in all of them (cpu = -1). */ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, enum bp_type_idx type) { const struct cpumask *cpumask = cpumask_of_bp(bp); int cpu; for_each_cpu(cpu, cpumask) { struct bp_cpuinfo *info = get_bp_info(cpu, type); int nr; nr = info->cpu_pinned; if (!bp->hw.target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); if (nr > slots->pinned) slots->pinned = nr; nr = info->flexible; if (nr > slots->flexible) slots->flexible = nr; } } /* * For now, continue to consider flexible as pinned, until we can * ensure no flexible event can ever be scheduled before a pinned event * in a same cpu. */ static void fetch_this_slot(struct bp_busy_slots *slots, int weight) { slots->pinned += weight; } /* * Add a pinned breakpoint for the given task in our constraint table */ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, enum bp_type_idx type, int weight) { unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int old_idx, new_idx; old_idx = task_bp_pinned(cpu, bp, type) - 1; new_idx = old_idx + weight; if (old_idx >= 0) tsk_pinned[old_idx]--; if (new_idx >= 0) tsk_pinned[new_idx]++; } /* * Add/remove the given breakpoint in our constraint table */ static void toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { const struct cpumask *cpumask = cpumask_of_bp(bp); int cpu; if (!enable) weight = -weight; /* Pinned counter cpu profiling */ if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } /* Pinned counter task profiling */ for_each_cpu(cpu, cpumask) toggle_bp_task_slot(bp, cpu, type, weight); if (enable) list_add_tail(&bp->hw.bp_list, &bp_task_head); else list_del(&bp->hw.bp_list); } __weak int arch_reserve_bp_slot(struct perf_event *bp) { return 0; } __weak void arch_release_bp_slot(struct perf_event *bp) { } /* * Function to perform processor-specific cleanup during unregistration */ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) { /* * A weak stub function here for those archs that don't define * it inside arch/.../kernel/hw_breakpoint.c */ } /* * Constraints to check before allowing this new breakpoint counter: * * == Non-pinned counter == (Considered as pinned for now) * * - If attached to a single cpu, check: * * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu) * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. * Otherwise, we check that the maximum number of per task * breakpoints (for this cpu) plus the number of per cpu breakpoint * (for this cpu) doesn't cover every registers. * * - If attached to every cpus, check: * * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks * breakpoints. * * * == Pinned counter == * * - If attached to a single cpu, check: * * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu) * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM * * -> Same checks as before. But now the info->flexible, if any, must keep * one register at least (or they will never be fed). * * - If attached to every cpus, check: * * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { struct bp_busy_slots slots = {0}; enum bp_type_idx type; int weight; int ret; /* We couldn't initialize breakpoint constraints on boot */ if (!constraints_initialized) return -ENOMEM; /* Basic checks */ if (bp_type == HW_BREAKPOINT_EMPTY || bp_type == HW_BREAKPOINT_INVALID) return -EINVAL; type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); /* * Simulate the addition of this breakpoint to the constraints * and see the result. */ fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; ret = arch_reserve_bp_slot(bp); if (ret) return ret; toggle_bp_slot(bp, true, type, weight); return 0; } int reserve_bp_slot(struct perf_event *bp) { int ret; mutex_lock(&nr_bp_mutex); ret = __reserve_bp_slot(bp, bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); return ret; } static void __release_bp_slot(struct perf_event *bp, u64 bp_type) { enum bp_type_idx type; int weight; arch_release_bp_slot(bp); type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); } void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp, bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); } static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int err; __release_bp_slot(bp, old_type); err = __reserve_bp_slot(bp, new_type); if (err) { /* * Reserve the old_type slot back in case * there's no space for the new type. * * This must succeed, because we just released * the old_type slot in the __release_bp_slot * call above. If not, something is broken. */ WARN_ON(__reserve_bp_slot(bp, old_type)); } return err; } static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int ret; mutex_lock(&nr_bp_mutex); ret = __modify_bp_slot(bp, old_type, new_type); mutex_unlock(&nr_bp_mutex); return ret; } /* * Allow the kernel debugger to reserve breakpoint slots without * taking a lock using the dbg_* variant of for the reserve and * release breakpoint slots. */ int dbg_reserve_bp_slot(struct perf_event *bp) { if (mutex_is_locked(&nr_bp_mutex)) return -1; return __reserve_bp_slot(bp, bp->attr.bp_type); } int dbg_release_bp_slot(struct perf_event *bp) { if (mutex_is_locked(&nr_bp_mutex)) return -1; __release_bp_slot(bp, bp->attr.bp_type); return 0; } static int hw_breakpoint_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { int err; err = hw_breakpoint_arch_parse(bp, attr, hw); if (err) return err; if (arch_check_bp_in_kernelspace(hw)) { if (attr->exclude_kernel) return -EINVAL; /* * Don't let unprivileged users set a breakpoint in the trap * path to avoid trap recursion attacks. */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; } return 0; } int register_perf_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint hw = { }; int err; err = reserve_bp_slot(bp); if (err) return err; err = hw_breakpoint_parse(bp, &bp->attr, &hw); if (err) { release_bp_slot(bp); return err; } bp->hw.info = hw; return 0; } /** * register_user_hw_breakpoint - register a hardware breakpoint for user space * @attr: breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint * @context: context data could be used in the triggered callback * @tsk: pointer to 'task_struct' of the process to which the address belongs */ struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, struct task_struct *tsk) { return perf_event_create_kernel_counter(attr, -1, tsk, triggered, context); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); static void hw_breakpoint_copy_attr(struct perf_event_attr *to, struct perf_event_attr *from) { to->bp_addr = from->bp_addr; to->bp_type = from->bp_type; to->bp_len = from->bp_len; to->disabled = from->disabled; } int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { struct arch_hw_breakpoint hw = { }; int err; err = hw_breakpoint_parse(bp, attr, &hw); if (err) return err; if (check) { struct perf_event_attr old_attr; old_attr = bp->attr; hw_breakpoint_copy_attr(&old_attr, attr); if (memcmp(&old_attr, attr, sizeof(*attr))) return -EINVAL; } if (bp->attr.bp_type != attr->bp_type) { err = modify_bp_slot(bp, bp->attr.bp_type, attr->bp_type); if (err) return err; } hw_breakpoint_copy_attr(&bp->attr, attr); bp->hw.info = hw; return 0; } /** * modify_user_hw_breakpoint - modify a user-space hardware breakpoint * @bp: the breakpoint structure to modify * @attr: new breakpoint attributes */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { int err; /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. * So call the function directly after making sure we are targeting the * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) perf_event_disable_local(bp); else perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, false); if (!bp->attr.disabled) perf_event_enable(bp); return err; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); /** * unregister_hw_breakpoint - unregister a user-space hardware breakpoint * @bp: the breakpoint structure to unregister */ void unregister_hw_breakpoint(struct perf_event *bp) { if (!bp) return; perf_event_release_kernel(bp); } EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); /** * register_wide_hw_breakpoint - register a wide breakpoint in the kernel * @attr: breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint * @context: context data could be used in the triggered callback * * @return a set of per_cpu pointers to perf events */ struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context) { struct perf_event * __percpu *cpu_events, *bp; long err = 0; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) return (void __percpu __force *)ERR_PTR(-ENOMEM); cpus_read_lock(); for_each_online_cpu(cpu) { bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered, context); if (IS_ERR(bp)) { err = PTR_ERR(bp); break; } per_cpu(*cpu_events, cpu) = bp; } cpus_read_unlock(); if (likely(!err)) return cpu_events; unregister_wide_hw_breakpoint(cpu_events); return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); /** * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; for_each_possible_cpu(cpu) unregister_hw_breakpoint(per_cpu(*cpu_events, cpu)); free_percpu(cpu_events); } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ .priority = 0x7fffffff }; static void bp_perf_event_destroy(struct perf_event *event) { release_bp_slot(event); } static int hw_breakpoint_event_init(struct perf_event *bp) { int err; if (bp->attr.type != PERF_TYPE_BREAKPOINT) return -ENOENT; /* * no branch sampling for breakpoint events */ if (has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); if (err) return err; bp->destroy = bp_perf_event_destroy; return 0; } static int hw_breakpoint_add(struct perf_event *bp, int flags) { if (!(flags & PERF_EF_START)) bp->hw.state = PERF_HES_STOPPED; if (is_sampling_event(bp)) { bp->hw.last_period = bp->hw.sample_period; perf_swevent_set_period(bp); } return arch_install_hw_breakpoint(bp); } static void hw_breakpoint_del(struct perf_event *bp, int flags) { arch_uninstall_hw_breakpoint(bp); } static void hw_breakpoint_start(struct perf_event *bp, int flags) { bp->hw.state = 0; } static void hw_breakpoint_stop(struct perf_event *bp, int flags) { bp->hw.state = PERF_HES_STOPPED; } static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ .event_init = hw_breakpoint_event_init, .add = hw_breakpoint_add, .del = hw_breakpoint_del, .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, }; int __init init_hw_breakpoint(void) { int cpu, err_cpu; int i; for (i = 0; i < TYPE_MAX; i++) nr_slots[i] = hw_breakpoint_slots(i); for_each_possible_cpu(cpu) { for (i = 0; i < TYPE_MAX; i++) { struct bp_cpuinfo *info = get_bp_info(cpu, i); info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), GFP_KERNEL); if (!info->tsk_pinned) goto err_alloc; } } constraints_initialized = 1; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); err_alloc: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) kfree(get_bp_info(err_cpu, i)->tsk_pinned); if (err_cpu == cpu) break; } return -ENOMEM; }
6748 728 6720 832 356 4014 4623 2162 3265 3973 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor contexts used to associate "labels" to objects. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_CONTEXT_H #define __AA_CONTEXT_H #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> #include "label.h" #include "policy_ns.h" #include "task.h" static inline struct aa_label *cred_label(const struct cred *cred) { struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred; AA_BUG(!blob); return *blob; } static inline void set_cred_label(const struct cred *cred, struct aa_label *label) { struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred; AA_BUG(!blob); *blob = label; } /** * aa_cred_raw_label - obtain cred's label * @cred: cred to obtain label from (NOT NULL) * * Returns: confining label * * does NOT increment reference count */ static inline struct aa_label *aa_cred_raw_label(const struct cred *cred) { struct aa_label *label = cred_label(cred); AA_BUG(!label); return label; } /** * aa_get_newest_cred_label - obtain the newest label on a cred * @cred: cred to obtain label from (NOT NULL) * * Returns: newest version of confining label */ static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred) { return aa_get_newest_label(aa_cred_raw_label(cred)); } /** * __aa_task_raw_label - retrieve another task's label * @task: task to query (NOT NULL) * * Returns: @task's label without incrementing its ref count * * If @task != current needs to be called in RCU safe critical section */ static inline struct aa_label *__aa_task_raw_label(struct task_struct *task) { return aa_cred_raw_label(__task_cred(task)); } /** * aa_current_raw_label - find the current tasks confining label * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * This fn will not update the tasks cred to the most up to date version * of the label so it is safe to call when inside of locks. */ static inline struct aa_label *aa_current_raw_label(void) { return aa_cred_raw_label(current_cred()); } /** * aa_get_current_label - get the newest version of the current tasks label * * Returns: newest version of confining label (NOT NULL) * * This fn will not update the tasks cred, so it is safe inside of locks * * The returned reference must be put with aa_put_label() */ static inline struct aa_label *aa_get_current_label(void) { struct aa_label *l = aa_current_raw_label(); if (label_is_stale(l)) return aa_get_newest_label(l); return aa_get_label(l); } #define __end_current_label_crit_section(X) end_current_label_crit_section(X) /** * end_label_crit_section - put a reference found with begin_current_label.. * @label: label reference to put * * Should only be used with a reference obtained with * begin_current_label_crit_section and never used in situations where the * task cred may be updated */ static inline void end_current_label_crit_section(struct aa_label *label) { if (label != aa_current_raw_label()) aa_put_label(label); } /** * __begin_current_label_crit_section - current's confining label * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * safe to call inside locks * * The returned reference must be put with __end_current_label_crit_section() * This must NOT be used if the task cred could be updated within the * critical section between __begin_current_label_crit_section() .. * __end_current_label_crit_section() */ static inline struct aa_label *__begin_current_label_crit_section(void) { struct aa_label *label = aa_current_raw_label(); if (label_is_stale(label)) label = aa_get_newest_label(label); return label; } /** * begin_current_label_crit_section - current's confining label and update it * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * Not safe to call inside locks * * The returned reference must be put with end_current_label_crit_section() * This must NOT be used if the task cred could be updated within the * critical section between begin_current_label_crit_section() .. * end_current_label_crit_section() */ static inline struct aa_label *begin_current_label_crit_section(void) { struct aa_label *label = aa_current_raw_label(); might_sleep(); if (label_is_stale(label)) { label = aa_get_newest_label(label); if (aa_replace_current_label(label) == 0) /* task cred will keep the reference */ aa_put_label(label); } return label; } static inline struct aa_ns *aa_get_current_ns(void) { struct aa_label *label; struct aa_ns *ns; label = __begin_current_label_crit_section(); ns = aa_get_ns(labels_ns(label)); __end_current_label_crit_section(label); return ns; } #endif /* __AA_CONTEXT_H */
5 5 1 2 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #ifndef _LINUX_SKMSG_H #define _LINUX_SKMSG_H #include <linux/bpf.h> #include <linux/filter.h> #include <linux/scatterlist.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp.h> #include <net/strparser.h> #define MAX_MSG_FRAGS MAX_SKB_FRAGS #define NR_MSG_FRAG_IDS (MAX_MSG_FRAGS + 1) enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, __SK_NONE, }; struct sk_msg_sg { u32 start; u32 curr; u32 end; u32 size; u32 copybreak; unsigned long copy; /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the * chaining; * 2) to chain tailer SG entries after the message. */ struct scatterlist data[MAX_MSG_FRAGS + 2]; }; static_assert(BITS_PER_LONG >= NR_MSG_FRAG_IDS); /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; void *data_end; u32 apply_bytes; u32 cork_bytes; u32 flags; struct sk_buff *skb; struct sock *sk_redir; struct sock *sk; struct list_head list; }; struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, }; struct sk_psock_link { struct list_head list; struct bpf_map *map; void *link_raw; }; struct sk_psock_work_state { struct sk_buff *skb; u32 len; u32 off; }; struct sk_psock { struct sock *sk; struct sock *sk_redir; u32 apply_bytes; u32 cork_bytes; u32 eval; bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; refcount_t refcnt; void (*saved_unhash)(struct sock *sk); void (*saved_destroy)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len); void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); int sk_msg_free(struct sock *sk, struct sk_msg *msg); int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { WARN_ON(i == msg->sg.end && bytes); } static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) { if (psock->apply_bytes) { if (psock->apply_bytes < bytes) psock->apply_bytes = 0; else psock->apply_bytes -= bytes; } } static inline u32 sk_msg_iter_dist(u32 start, u32 end) { return end >= start ? end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) #define sk_msg_iter_var_next(var) \ do { \ var++; \ if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) #define sk_msg_iter_prev(msg, which) \ sk_msg_iter_var_prev(msg->sg.which) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) static inline void sk_msg_clear_meta(struct sk_msg *msg) { memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy)); } static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, int which, u32 size) { dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) { memcpy(dst, src, sizeof(*src)); sk_msg_init(src); } static inline bool sk_msg_full(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; } static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) { return msg->sg.data[which]; } static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); } static inline bool sk_msg_to_ingress(const struct sk_msg *msg) { return msg->flags & BPF_F_INGRESS; } static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); if (test_bit(msg->sg.start, &msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { msg->data = sg_virt(sge); msg->data_end = msg->data + sge->length; } } static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, u32 len, u32 offset) { struct scatterlist *sge; get_page(page); sge = sk_msg_elem(msg, msg->sg.end); sg_set_page(sge, page, len, offset); sg_unmark_end(sge); __set_bit(msg->sg.end, &msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) __set_bit(i, &msg->sg.copy); else __clear_bit(i, &msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; } while (1); } static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, true); } static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, false); } static inline struct sk_psock *sk_psock(const struct sock *sk) { return __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_PSOCK); } static inline void sk_psock_set_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { set_bit(bit, &psock->state); } static inline void sk_psock_clear_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { clear_bit(bit, &psock->state); } static inline bool sk_psock_test_state(const struct sk_psock *psock, enum sk_psock_state_bits bit) { return test_bit(bit, &psock->state); } static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); kfree_skb(skb); } static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); else { sk_msg_free(psock->sk, msg); kfree(msg); } spin_unlock_bh(&psock->ingress_lock); } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); if (msg) list_del(&msg->list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, struct sk_msg *msg) { struct sk_msg *ret; spin_lock_bh(&psock->ingress_lock); if (list_is_last(&msg->list, &psock->ingress_msg)) ret = NULL; else ret = list_next_entry(msg, list); spin_unlock_bh(&psock->ingress_lock); return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) { return psock ? list_empty(&psock->ingress_msg) : true; } static inline void kfree_sk_msg(struct sk_msg *msg) { if (msg->skb) consume_skb(msg->skb); kfree(msg); } static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; sk->sk_err = err; sk_error_report(sk); } struct sk_psock *sk_psock_init(struct sock *sk, int node); void sk_psock_stop(struct sk_psock *psock); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); #else static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { return -EOPNOTSUPP; } static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { } static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { } #endif void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); static inline struct sk_psock_link *sk_psock_init_link(void) { return kzalloc(sizeof(struct sk_psock_link), GFP_ATOMIC | __GFP_NOWARN); } static inline void sk_psock_free_link(struct sk_psock_link *link) { kfree(link); } struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { sk_msg_free(psock->sk, psock->cork); kfree(psock->cork); psock->cork = NULL; } } static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, true); } static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock && !refcount_inc_not_zero(&psock->refcnt)) psock = NULL; rcu_read_unlock(); return psock; } void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) { if (refcount_dec_and_test(&psock->refcnt)) sk_psock_drop(sk, psock); } static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); } static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { prog = xchg(pprog, prog); if (prog) bpf_prog_put(prog); } static inline int psock_replace_prog(struct bpf_prog **pprog, struct bpf_prog *prog, struct bpf_prog *old) { if (cmpxchg(pprog, old, prog) != old) return -ENOENT; if (old) bpf_prog_put(old); return 0; } static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; return !!psock->saved_data_ready; } #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) /* We only have two bits so far. */ #define BPF_F_PTR_MASK ~(BPF_F_INGRESS | BPF_F_STRPARSER) static inline bool skb_bpf_strparser(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_STRPARSER; } static inline void skb_bpf_set_strparser(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_STRPARSER; } static inline bool skb_bpf_ingress(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_INGRESS; } static inline void skb_bpf_set_ingress(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_INGRESS; } static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, bool ingress) { skb->_sk_redir = (unsigned long)sk_redir; if (ingress) skb->_sk_redir |= BPF_F_INGRESS; } static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return (struct sock *)(sk_redir & BPF_F_PTR_MASK); } static inline void skb_bpf_redirect_clear(struct sk_buff *skb) { skb->_sk_redir = 0; } #endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */
1084 1084 1083 185 412 185 553 613 595 595 193 595 193 412 1 1 4 596 597 597 5 595 1 14 14 56 817 592 593 154 154 1085 1085 721 14 13 1 4 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "hard-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/kernel.h> #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include "bat_v.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "log.h" #include "originator.h" #include "send.h" #include "soft-interface.h" #include "translation-table.h" /** * batadv_hardif_release() - release hard interface from lists and queue for * free after rcu grace period * @ref: kref pointer of the hard interface */ void batadv_hardif_release(struct kref *ref) { struct batadv_hard_iface *hard_iface; hard_iface = container_of(ref, struct batadv_hard_iface, refcount); dev_put(hard_iface->net_dev); kfree_rcu(hard_iface, rcu); } /** * batadv_hardif_get_by_netdev() - Get hard interface object of a net_device * @net_dev: net_device to search for * * Return: batadv_hard_iface of net_dev (with increased refcnt), NULL on errors */ struct batadv_hard_iface * batadv_hardif_get_by_netdev(const struct net_device *net_dev) { struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->net_dev == net_dev && kref_get_unless_zero(&hard_iface->refcount)) goto out; } hard_iface = NULL; out: rcu_read_unlock(); return hard_iface; } /** * batadv_getlink_net() - return link net namespace (of use fallback) * @netdev: net_device to check * @fallback_net: return in case get_link_net is not available for @netdev * * Return: result of rtnl_link_ops->get_link_net or @fallback_net */ static struct net *batadv_getlink_net(const struct net_device *netdev, struct net *fallback_net) { if (!netdev->rtnl_link_ops) return fallback_net; if (!netdev->rtnl_link_ops->get_link_net) return fallback_net; return netdev->rtnl_link_ops->get_link_net(netdev); } /** * batadv_mutual_parents() - check if two devices are each others parent * @dev1: 1st net dev * @net1: 1st devices netns * @dev2: 2nd net dev * @net2: 2nd devices netns * * veth devices come in pairs and each is the parent of the other! * * Return: true if the devices are each others parent, otherwise false */ static bool batadv_mutual_parents(const struct net_device *dev1, struct net *net1, const struct net_device *dev2, struct net *net2) { int dev1_parent_iflink = dev_get_iflink(dev1); int dev2_parent_iflink = dev_get_iflink(dev2); const struct net *dev1_parent_net; const struct net *dev2_parent_net; dev1_parent_net = batadv_getlink_net(dev1, net1); dev2_parent_net = batadv_getlink_net(dev2, net2); if (!dev1_parent_iflink || !dev2_parent_iflink) return false; return (dev1_parent_iflink == dev2->ifindex) && (dev2_parent_iflink == dev1->ifindex) && net_eq(dev1_parent_net, net2) && net_eq(dev2_parent_net, net1); } /** * batadv_is_on_batman_iface() - check if a device is a batman iface descendant * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it * is important to prevent this new interface from being used to create a new * mesh network (this behaviour would lead to a batman-over-batman * configuration). This function recursively checks all the fathers of the * device passed as argument looking for a batman-adv soft interface. * * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise */ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) { struct net *net = dev_net(net_dev); struct net_device *parent_dev; struct net *parent_net; int iflink; bool ret; /* check if this is a batman-adv mesh interface */ if (batadv_softif_is_valid(net_dev)) return true; iflink = dev_get_iflink(net_dev); if (iflink == 0) return false; parent_net = batadv_getlink_net(net_dev, net); /* iflink to itself, most likely physical device */ if (net == parent_net && iflink == net_dev->ifindex) return false; /* recurse over the parent device */ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); /* if we got a NULL parent_dev there is something broken.. */ if (!parent_dev) { pr_err("Cannot find parent device\n"); return false; } if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net)) return false; ret = batadv_is_on_batman_iface(parent_dev); return ret; } static bool batadv_is_valid_iface(const struct net_device *net_dev) { if (net_dev->flags & IFF_LOOPBACK) return false; if (net_dev->type != ARPHRD_ETHER) return false; if (net_dev->addr_len != ETH_ALEN) return false; /* no batman over batman */ if (batadv_is_on_batman_iface(net_dev)) return false; return true; } /** * batadv_get_real_netdevice() - check if the given netdev struct is a virtual * interface on top of another 'real' interface * @netdev: the device to check * * Callers must hold the rtnl semaphore. You may want batadv_get_real_netdev() * instead of this. * * Return: the 'real' net device or the original net device and NULL in case * of an error. */ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *real_netdev = NULL; struct net *real_net; struct net *net; int iflink; ASSERT_RTNL(); if (!netdev) return NULL; iflink = dev_get_iflink(netdev); if (iflink == 0) { dev_hold(netdev); return netdev; } hard_iface = batadv_hardif_get_by_netdev(netdev); if (!hard_iface || !hard_iface->soft_iface) goto out; net = dev_net(hard_iface->soft_iface); real_net = batadv_getlink_net(netdev, net); /* iflink to itself, most likely physical device */ if (net == real_net && netdev->ifindex == iflink) { real_netdev = netdev; dev_hold(real_netdev); goto out; } real_netdev = dev_get_by_index(real_net, iflink); out: batadv_hardif_put(hard_iface); return real_netdev; } /** * batadv_get_real_netdev() - check if the given net_device struct is a virtual * interface on top of another 'real' interface * @net_device: the device to check * * Return: the 'real' net device or the original net device and NULL in case * of an error. */ struct net_device *batadv_get_real_netdev(struct net_device *net_device) { struct net_device *real_netdev; rtnl_lock(); real_netdev = batadv_get_real_netdevice(net_device); rtnl_unlock(); return real_netdev; } /** * batadv_is_wext_netdev() - check if the given net_device struct is a * wext wifi interface * @net_device: the device to check * * Return: true if the net device is a wext wireless device, false * otherwise. */ static bool batadv_is_wext_netdev(struct net_device *net_device) { if (!net_device) return false; #ifdef CONFIG_WIRELESS_EXT /* pre-cfg80211 drivers have to implement WEXT, so it is possible to * check for wireless_handlers != NULL */ if (net_device->wireless_handlers) return true; #endif return false; } /** * batadv_is_cfg80211_netdev() - check if the given net_device struct is a * cfg80211 wifi interface * @net_device: the device to check * * Return: true if the net device is a cfg80211 wireless device, false * otherwise. */ static bool batadv_is_cfg80211_netdev(struct net_device *net_device) { if (!net_device) return false; /* cfg80211 drivers have to set ieee80211_ptr */ if (net_device->ieee80211_ptr) return true; return false; } /** * batadv_wifi_flags_evaluate() - calculate wifi flags for net_device * @net_device: the device to check * * Return: batadv_hard_iface_wifi_flags flags of the device */ static u32 batadv_wifi_flags_evaluate(struct net_device *net_device) { u32 wifi_flags = 0; struct net_device *real_netdev; if (batadv_is_wext_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_WEXT_DIRECT; if (batadv_is_cfg80211_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; real_netdev = batadv_get_real_netdevice(net_device); if (!real_netdev) return wifi_flags; if (real_netdev == net_device) goto out; if (batadv_is_wext_netdev(real_netdev)) wifi_flags |= BATADV_HARDIF_WIFI_WEXT_INDIRECT; if (batadv_is_cfg80211_netdev(real_netdev)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT; out: dev_put(real_netdev); return wifi_flags; } /** * batadv_is_cfg80211_hardif() - check if the given hardif is a cfg80211 wifi * interface * @hard_iface: the device to check * * Return: true if the net device is a cfg80211 wireless device, false * otherwise. */ bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface) { u32 allowed_flags = 0; allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT; return !!(hard_iface->wifi_flags & allowed_flags); } /** * batadv_is_wifi_hardif() - check if the given hardif is a wifi interface * @hard_iface: the device to check * * Return: true if the net device is a 802.11 wireless device, false otherwise. */ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface) { if (!hard_iface) return false; return hard_iface->wifi_flags != 0; } /** * batadv_hardif_no_broadcast() - check whether (re)broadcast is necessary * @if_outgoing: the outgoing interface checked and considered for (re)broadcast * @orig_addr: the originator of this packet * @orig_neigh: originator address of the forwarder we just got the packet from * (NULL if we originated) * * Checks whether a packet needs to be (re)broadcasted on the given interface. * * Return: * BATADV_HARDIF_BCAST_NORECIPIENT: No neighbor on interface * BATADV_HARDIF_BCAST_DUPFWD: Just one neighbor, but it is the forwarder * BATADV_HARDIF_BCAST_DUPORIG: Just one neighbor, but it is the originator * BATADV_HARDIF_BCAST_OK: Several neighbors, must broadcast */ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, u8 *orig_addr, u8 *orig_neigh) { struct batadv_hardif_neigh_node *hardif_neigh; struct hlist_node *first; int ret = BATADV_HARDIF_BCAST_OK; rcu_read_lock(); /* 0 neighbors -> no (re)broadcast */ first = rcu_dereference(hlist_first_rcu(&if_outgoing->neigh_list)); if (!first) { ret = BATADV_HARDIF_BCAST_NORECIPIENT; goto out; } /* >1 neighbors -> (re)broadcast */ if (rcu_dereference(hlist_next_rcu(first))) goto out; hardif_neigh = hlist_entry(first, struct batadv_hardif_neigh_node, list); /* 1 neighbor, is the originator -> no rebroadcast */ if (orig_addr && batadv_compare_eth(hardif_neigh->orig, orig_addr)) { ret = BATADV_HARDIF_BCAST_DUPORIG; /* 1 neighbor, is the one we received from -> no rebroadcast */ } else if (orig_neigh && batadv_compare_eth(hardif_neigh->orig, orig_neigh)) { ret = BATADV_HARDIF_BCAST_DUPFWD; } out: rcu_read_unlock(); return ret; } static struct batadv_hard_iface * batadv_hardif_get_active(const struct net_device *soft_iface) { struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; if (hard_iface->if_status == BATADV_IF_ACTIVE && kref_get_unless_zero(&hard_iface->refcount)) goto out; } hard_iface = NULL; out: rcu_read_unlock(); return hard_iface; } static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv, struct batadv_hard_iface *oldif) { struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; batadv_dat_init_own_addr(bat_priv, primary_if); batadv_bla_update_orig_address(bat_priv, primary_if, oldif); out: batadv_hardif_put(primary_if); } static void batadv_primary_if_select(struct batadv_priv *bat_priv, struct batadv_hard_iface *new_hard_iface) { struct batadv_hard_iface *curr_hard_iface; ASSERT_RTNL(); if (new_hard_iface) kref_get(&new_hard_iface->refcount); curr_hard_iface = rcu_replace_pointer(bat_priv->primary_if, new_hard_iface, 1); if (!new_hard_iface) goto out; bat_priv->algo_ops->iface.primary_set(new_hard_iface); batadv_primary_if_update_addr(bat_priv, curr_hard_iface); out: batadv_hardif_put(curr_hard_iface); } static bool batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface) { if (hard_iface->net_dev->flags & IFF_UP) return true; return false; } static void batadv_check_known_mac_addr(const struct net_device *net_dev) { const struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->net_dev == net_dev) continue; if (!batadv_compare_eth(hard_iface->net_dev->dev_addr, net_dev->dev_addr)) continue; pr_warn("The newly added mac address (%pM) already exists on: %s\n", net_dev->dev_addr, hard_iface->net_dev->name); pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n"); } rcu_read_unlock(); } /** * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom * @soft_iface: netdev struct of the mesh interface */ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) { const struct batadv_hard_iface *hard_iface; unsigned short lower_header_len = ETH_HLEN; unsigned short lower_headroom = 0; unsigned short lower_tailroom = 0; unsigned short needed_headroom; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) continue; if (hard_iface->soft_iface != soft_iface) continue; lower_header_len = max_t(unsigned short, lower_header_len, hard_iface->net_dev->hard_header_len); lower_headroom = max_t(unsigned short, lower_headroom, hard_iface->net_dev->needed_headroom); lower_tailroom = max_t(unsigned short, lower_tailroom, hard_iface->net_dev->needed_tailroom); } rcu_read_unlock(); needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); needed_headroom += batadv_max_header_len(); /* fragmentation headers don't strip the unicast/... header */ needed_headroom += sizeof(struct batadv_frag_packet); soft_iface->needed_headroom = needed_headroom; soft_iface->needed_tailroom = lower_tailroom; } /** * batadv_hardif_min_mtu() - Calculate maximum MTU for soft interface * @soft_iface: netdev struct of the soft interface * * Return: MTU for the soft-interface (limited by the minimal MTU of all active * slave interfaces) */ int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); const struct batadv_hard_iface *hard_iface; int min_mtu = INT_MAX; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->soft_iface != soft_iface) continue; min_mtu = min_t(int, hard_iface->net_dev->mtu, min_mtu); } rcu_read_unlock(); if (atomic_read(&bat_priv->fragmentation) == 0) goto out; /* with fragmentation enabled the maximum size of internally generated * packets such as translation table exchanges or tvlv containers, etc * has to be calculated */ min_mtu = min_t(int, min_mtu, BATADV_FRAG_MAX_FRAG_SIZE); min_mtu -= sizeof(struct batadv_frag_packet); min_mtu *= BATADV_FRAG_MAX_FRAGMENTS; out: /* report to the other components the maximum amount of bytes that * batman-adv can send over the wire (without considering the payload * overhead). For example, this value is used by TT to compute the * maximum local table size */ atomic_set(&bat_priv->packet_size_max, min_mtu); /* the real soft-interface MTU is computed by removing the payload * overhead from the maximum amount of bytes that was just computed. * * However batman-adv does not support MTUs bigger than ETH_DATA_LEN */ return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN); } /** * batadv_update_min_mtu() - Adjusts the MTU if a new interface with a smaller * MTU appeared * @soft_iface: netdev struct of the soft interface */ void batadv_update_min_mtu(struct net_device *soft_iface) { soft_iface->mtu = batadv_hardif_min_mtu(soft_iface); /* Check if the local translate table should be cleaned up to match a * new (and smaller) MTU. */ batadv_tt_local_resize_to_mtu(soft_iface); } static void batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; bat_priv = netdev_priv(hard_iface->soft_iface); bat_priv->algo_ops->iface.update_mac(hard_iface); hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED; /* the first active interface becomes our primary interface or * the next active interface after the old primary interface was removed */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) batadv_primary_if_select(bat_priv, hard_iface); batadv_info(hard_iface->soft_iface, "Interface activated: %s\n", hard_iface->net_dev->name); batadv_update_min_mtu(hard_iface->soft_iface); if (bat_priv->algo_ops->iface.activate) bat_priv->algo_ops->iface.activate(hard_iface); out: batadv_hardif_put(primary_if); } static void batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) return; hard_iface->if_status = BATADV_IF_INACTIVE; batadv_info(hard_iface->soft_iface, "Interface deactivated: %s\n", hard_iface->net_dev->name); batadv_update_min_mtu(hard_iface->soft_iface); } /** * batadv_hardif_enable_interface() - Enslave hard interface to soft interface * @hard_iface: hard interface to add to soft interface * @soft_iface: netdev struct of the mesh interface * * Return: 0 on success or negative error number in case of failure */ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct net_device *soft_iface) { struct batadv_priv *bat_priv; __be16 ethertype = htons(ETH_P_BATMAN); int max_header_len = batadv_max_header_len(); int ret; if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; kref_get(&hard_iface->refcount); dev_hold(soft_iface); hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface, NULL, NULL, NULL); if (ret) goto err_dev; ret = bat_priv->algo_ops->iface.enable(hard_iface); if (ret < 0) goto err_upper; hard_iface->if_status = BATADV_IF_INACTIVE; kref_get(&hard_iface->refcount); hard_iface->batman_adv_ptype.type = ethertype; hard_iface->batman_adv_ptype.func = batadv_batman_skb_recv; hard_iface->batman_adv_ptype.dev = hard_iface->net_dev; dev_add_pack(&hard_iface->batman_adv_ptype); batadv_info(hard_iface->soft_iface, "Adding interface: %s\n", hard_iface->net_dev->name); if (atomic_read(&bat_priv->fragmentation) && hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to %i would solve the problem.\n", hard_iface->net_dev->name, hard_iface->net_dev->mtu, ETH_DATA_LEN + max_header_len); if (!atomic_read(&bat_priv->fragmentation) && hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. If you experience problems getting traffic through try increasing the MTU to %i.\n", hard_iface->net_dev->name, hard_iface->net_dev->mtu, ETH_DATA_LEN + max_header_len); if (batadv_hardif_is_iface_up(hard_iface)) batadv_hardif_activate_interface(hard_iface); else batadv_err(hard_iface->soft_iface, "Not using interface %s (retrying later): interface not active\n", hard_iface->net_dev->name); batadv_hardif_recalc_extra_skbroom(soft_iface); if (bat_priv->algo_ops->iface.enabled) bat_priv->algo_ops->iface.enabled(hard_iface); out: return 0; err_upper: netdev_upper_dev_unlink(hard_iface->net_dev, soft_iface); err_dev: hard_iface->soft_iface = NULL; dev_put(soft_iface); batadv_hardif_put(hard_iface); return ret; } /** * batadv_hardif_cnt() - get number of interfaces enslaved to soft interface * @soft_iface: soft interface to check * * This function is only using RCU for locking - the result can therefore be * off when another function is modifying the list at the same time. The * caller can use the rtnl_lock to make sure that the count is accurate. * * Return: number of connected/enslaved hard interfaces */ static size_t batadv_hardif_cnt(const struct net_device *soft_iface) { struct batadv_hard_iface *hard_iface; size_t count = 0; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; count++; } rcu_read_unlock(); return count; } /** * batadv_hardif_disable_interface() - Remove hard interface from soft interface * @hard_iface: hard interface to be removed */ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if = NULL; batadv_hardif_deactivate_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; batadv_info(hard_iface->soft_iface, "Removing interface: %s\n", hard_iface->net_dev->name); dev_remove_pack(&hard_iface->batman_adv_ptype); batadv_hardif_put(hard_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { struct batadv_hard_iface *new_if; new_if = batadv_hardif_get_active(hard_iface->soft_iface); batadv_primary_if_select(bat_priv, new_if); batadv_hardif_put(new_if); } bat_priv->algo_ops->iface.disable(hard_iface); hard_iface->if_status = BATADV_IF_NOT_IN_USE; /* delete all references to this hard_iface */ batadv_purge_orig_ref(bat_priv); batadv_purge_outstanding_packets(bat_priv, hard_iface); dev_put(hard_iface->soft_iface); netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); /* nobody uses this interface anymore */ if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) batadv_gw_check_client_stop(bat_priv); hard_iface->soft_iface = NULL; batadv_hardif_put(hard_iface); out: batadv_hardif_put(primary_if); } static struct batadv_hard_iface * batadv_hardif_add_interface(struct net_device *net_dev) { struct batadv_hard_iface *hard_iface; ASSERT_RTNL(); if (!batadv_is_valid_iface(net_dev)) goto out; dev_hold(net_dev); hard_iface = kzalloc(sizeof(*hard_iface), GFP_ATOMIC); if (!hard_iface) goto release_dev; hard_iface->net_dev = net_dev; hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); mutex_init(&hard_iface->bat_iv.ogm_buff_mutex); spin_lock_init(&hard_iface->neigh_list_lock); kref_init(&hard_iface->refcount); hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT; hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev); if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; atomic_set(&hard_iface->hop_penalty, 0); batadv_v_hardif_init(hard_iface); batadv_check_known_mac_addr(hard_iface->net_dev); kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); batadv_hardif_generation++; return hard_iface; release_dev: dev_put(net_dev); out: return NULL; } static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) { ASSERT_RTNL(); /* first deactivate interface */ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) batadv_hardif_disable_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) return; hard_iface->if_status = BATADV_IF_TO_BE_REMOVED; batadv_hardif_put(hard_iface); } /** * batadv_hard_if_event_softif() - Handle events for soft interfaces * @event: NETDEV_* event to handle * @net_dev: net_device which generated an event * * Return: NOTIFY_* result */ static int batadv_hard_if_event_softif(unsigned long event, struct net_device *net_dev) { struct batadv_priv *bat_priv; switch (event) { case NETDEV_REGISTER: bat_priv = netdev_priv(net_dev); batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); break; } return NOTIFY_DONE; } static int batadv_hard_if_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *net_dev = netdev_notifier_info_to_dev(ptr); struct batadv_hard_iface *hard_iface; struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; if (batadv_softif_is_valid(net_dev)) return batadv_hard_if_event_softif(event, net_dev); hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface && (event == NETDEV_REGISTER || event == NETDEV_POST_TYPE_CHANGE)) hard_iface = batadv_hardif_add_interface(net_dev); if (!hard_iface) goto out; switch (event) { case NETDEV_UP: batadv_hardif_activate_interface(hard_iface); break; case NETDEV_GOING_DOWN: case NETDEV_DOWN: batadv_hardif_deactivate_interface(hard_iface); break; case NETDEV_UNREGISTER: case NETDEV_PRE_TYPE_CHANGE: list_del_rcu(&hard_iface->list); batadv_hardif_generation++; batadv_hardif_remove_interface(hard_iface); break; case NETDEV_CHANGEMTU: if (hard_iface->soft_iface) batadv_update_min_mtu(hard_iface->soft_iface); break; case NETDEV_CHANGEADDR: if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) goto hardif_put; batadv_check_known_mac_addr(hard_iface->net_dev); bat_priv = netdev_priv(hard_iface->soft_iface); bat_priv->algo_ops->iface.update_mac(hard_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto hardif_put; if (hard_iface == primary_if) batadv_primary_if_update_addr(bat_priv, NULL); break; case NETDEV_CHANGEUPPER: hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev); if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; break; default: break; } hardif_put: batadv_hardif_put(hard_iface); out: batadv_hardif_put(primary_if); return NOTIFY_DONE; } struct notifier_block batadv_hard_if_notifier = { .notifier_call = batadv_hard_if_event, };
823 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM kmem #if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KMEM_H #include <linux/types.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> DECLARE_EVENT_CLASS(kmem_alloc, TP_PROTO(unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags), TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( gfp_t, gfp_flags ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __entry->bytes_req = bytes_req; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = gfp_flags; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags)) ); DEFINE_EVENT(kmem_alloc, kmalloc, TP_PROTO(unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags), TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags) ); DEFINE_EVENT(kmem_alloc, kmem_cache_alloc, TP_PROTO(unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags), TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags) ); DECLARE_EVENT_CLASS(kmem_alloc_node, TP_PROTO(unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags, int node), TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( gfp_t, gfp_flags ) __field( int, node ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __entry->bytes_req = bytes_req; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = gfp_flags; __entry->node = node; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d", (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node) ); DEFINE_EVENT(kmem_alloc_node, kmalloc_node, TP_PROTO(unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags, int node), TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node) ); DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node, TP_PROTO(unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags, int node), TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node) ); TRACE_EVENT(kfree, TP_PROTO(unsigned long call_site, const void *ptr), TP_ARGS(call_site, ptr), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; ), TP_printk("call_site=%pS ptr=%p", (void *)__entry->call_site, __entry->ptr) ); TRACE_EVENT(kmem_cache_free, TP_PROTO(unsigned long call_site, const void *ptr, const char *name), TP_ARGS(call_site, ptr, name), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __string( name, name ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __assign_str(name, name); ), TP_printk("call_site=%pS ptr=%p name=%s", (void *)__entry->call_site, __entry->ptr, __get_str(name)) ); TRACE_EVENT(mm_page_free, TP_PROTO(struct page *page, unsigned int order), TP_ARGS(page, order), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->order = order; ), TP_printk("page=%p pfn=0x%lx order=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order) ); TRACE_EVENT(mm_page_free_batched, TP_PROTO(struct page *page), TP_ARGS(page), TP_STRUCT__entry( __field( unsigned long, pfn ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); ), TP_printk("page=%p pfn=0x%lx order=0", pfn_to_page(__entry->pfn), __entry->pfn) ); TRACE_EVENT(mm_page_alloc, TP_PROTO(struct page *page, unsigned int order, gfp_t gfp_flags, int migratetype), TP_ARGS(page, order, gfp_flags, migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( gfp_t, gfp_flags ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->gfp_flags = gfp_flags; __entry->migratetype = migratetype; ), TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, __entry->migratetype, show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_page, TP_PROTO(struct page *page, unsigned int order, int migratetype), TP_ARGS(page, order, migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->migratetype = migratetype; ), TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, __entry->migratetype, __entry->order == 0) ); DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked, TP_PROTO(struct page *page, unsigned int order, int migratetype), TP_ARGS(page, order, migratetype) ); TRACE_EVENT(mm_page_pcpu_drain, TP_PROTO(struct page *page, unsigned int order, int migratetype), TP_ARGS(page, order, migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->migratetype = migratetype; ), TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order, __entry->migratetype) ); TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, int alloc_order, int fallback_order, int alloc_migratetype, int fallback_migratetype), TP_ARGS(page, alloc_order, fallback_order, alloc_migratetype, fallback_migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( int, alloc_order ) __field( int, fallback_order ) __field( int, alloc_migratetype ) __field( int, fallback_migratetype ) __field( int, change_ownership ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->alloc_order = alloc_order; __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; __entry->change_ownership = (alloc_migratetype == get_pageblock_migratetype(page)); ), TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->alloc_order, __entry->fallback_order, pageblock_order, __entry->alloc_migratetype, __entry->fallback_migratetype, __entry->fallback_order < pageblock_order, __entry->change_ownership) ); /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ #ifndef __PTR_TO_HASHVAL static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr) { int ret; unsigned long hashval; ret = ptr_to_hashval(ptr, &hashval); if (ret) return 0; /* The hashed value is only 32-bit */ return (unsigned int)hashval; } #define __PTR_TO_HASHVAL #endif #define TRACE_MM_PAGES \ EM(MM_FILEPAGES) \ EM(MM_ANONPAGES) \ EM(MM_SWAPENTS) \ EMe(MM_SHMEMPAGES) #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); TRACE_MM_PAGES #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, int member, long count), TP_ARGS(mm, member, count), TP_STRUCT__entry( __field(unsigned int, mm_id) __field(unsigned int, curr) __field(int, member) __field(long, size) ), TP_fast_assign( __entry->mm_id = mm_ptr_to_hash(mm); __entry->curr = !!(current->mm == mm); __entry->member = member; __entry->size = (count << PAGE_SHIFT); ), TP_printk("mm_id=%u curr=%d type=%s size=%ldB", __entry->mm_id, __entry->curr, __print_symbolic(__entry->member, TRACE_MM_PAGES), __entry->size) ); #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
4987 728 67 67 8 92 462 193 1572 470 480 459 4 92 368 1132 193 550 853 1175 2723 711 2011 9 34 838 727 28 728 114 832 13 132 65 917 16 4 986 906 288 542 294 179 147 1215 417 150 26 177 135 182 3 187 16 63 135 15 29 4016 168 1162 167 2499 1297 24 497 377 109 162 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 /* SPDX-License-Identifier: GPL-2.0 */ /* * Linux Security Module Hook declarations. * * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com> * Copyright (C) 2001 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com> * Copyright (C) 2001 James Morris <jmorris@intercode.com.au> * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group) * Copyright (C) 2015 Intel Corporation. * Copyright (C) 2015 Casey Schaufler <casey@schaufler-ca.com> * Copyright (C) 2016 Mellanox Techonologies * Copyright (C) 2020 Google LLC. */ /* * The macro LSM_HOOK is used to define the data structures required by * the LSM framework using the pattern: * * LSM_HOOK(<return_type>, <default_value>, <hook_name>, args...) * * struct security_hook_heads { * #define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME; * #include <linux/lsm_hook_defs.h> * #undef LSM_HOOK * }; */ LSM_HOOK(int, 0, binder_set_context_mgr, const struct cred *mgr) LSM_HOOK(int, 0, binder_transaction, const struct cred *from, const struct cred *to) LSM_HOOK(int, 0, binder_transfer_binder, const struct cred *from, const struct cred *to) LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from, const struct cred *to, struct file *file) LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child, unsigned int mode) LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent) LSM_HOOK(int, 0, capget, struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) LSM_HOOK(int, 0, capset, struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) LSM_HOOK(int, 0, capable, const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, struct super_block *sb) LSM_HOOK(int, 0, quota_on, struct dentry *dentry) LSM_HOOK(int, 0, syslog, int type) LSM_HOOK(int, 0, settime, const struct timespec64 *ts, const struct timezone *tz) LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages) LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm) LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, struct file *file) LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm) LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc, struct fs_context *src_sc) LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc, struct fs_parameter *param) LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts) LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts) LSM_HOOK(int, 0, sb_mnt_opts_compat, struct super_block *sb, void *mnt_opts) LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts) LSM_HOOK(int, 0, sb_kern_mount, struct super_block *sb) LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb) LSM_HOOK(int, 0, sb_statfs, struct dentry *dentry) LSM_HOOK(int, 0, sb_mount, const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) LSM_HOOK(int, 0, sb_umount, struct vfsmount *mnt, int flags) LSM_HOOK(int, 0, sb_pivotroot, const struct path *old_path, const struct path *new_path) LSM_HOOK(int, 0, sb_set_mnt_opts, struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) LSM_HOOK(int, 0, sb_clone_mnt_opts, const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) LSM_HOOK(int, 0, sb_add_mnt_opt, const char *option, const char *val, int len, void **mnt_opts) LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) #ifdef CONFIG_SECURITY_PATH LSM_HOOK(int, 0, path_unlink, const struct path *dir, struct dentry *dentry) LSM_HOOK(int, 0, path_mkdir, const struct path *dir, struct dentry *dentry, umode_t mode) LSM_HOOK(int, 0, path_rmdir, const struct path *dir, struct dentry *dentry) LSM_HOOK(int, 0, path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) LSM_HOOK(int, 0, path_truncate, const struct path *path) LSM_HOOK(int, 0, path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name) LSM_HOOK(int, 0, path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) LSM_HOOK(int, 0, path_rename, const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) LSM_HOOK(int, 0, path_chmod, const struct path *path, umode_t mode) LSM_HOOK(int, 0, path_chown, const struct path *path, kuid_t uid, kgid_t gid) LSM_HOOK(int, 0, path_chroot, const struct path *path) #endif /* CONFIG_SECURITY_PATH */ /* Needed for inode based security check */ LSM_HOOK(int, 0, path_notify, const struct path *path, u64 mask, unsigned int obj_type) LSM_HOOK(int, 0, inode_alloc_security, struct inode *inode) LSM_HOOK(void, LSM_RET_VOID, inode_free_security, struct inode *inode) LSM_HOOK(int, 0, inode_init_security, struct inode *inode, struct inode *dir, const struct qstr *qstr, const char **name, void **value, size_t *len) LSM_HOOK(int, 0, inode_init_security_anon, struct inode *inode, const struct qstr *name, const struct inode *context_inode) LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry, umode_t mode) LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) LSM_HOOK(int, 0, inode_unlink, struct inode *dir, struct dentry *dentry) LSM_HOOK(int, 0, inode_symlink, struct inode *dir, struct dentry *dentry, const char *old_name) LSM_HOOK(int, 0, inode_mkdir, struct inode *dir, struct dentry *dentry, umode_t mode) LSM_HOOK(int, 0, inode_rmdir, struct inode *dir, struct dentry *dentry) LSM_HOOK(int, 0, inode_mknod, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) LSM_HOOK(int, 0, inode_rename, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) LSM_HOOK(int, 0, inode_readlink, struct dentry *dentry) LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode, bool rcu) LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask) LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr) LSM_HOOK(int, 0, inode_getattr, const struct path *path) LSM_HOOK(int, 0, inode_setxattr, struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry) LSM_HOOK(int, 0, inode_removexattr, struct user_namespace *mnt_userns, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry) LSM_HOOK(int, 0, inode_killpriv, struct user_namespace *mnt_userns, struct dentry *dentry) LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct user_namespace *mnt_userns, struct inode *inode, const char *name, void **buffer, bool alloc) LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode, const char *name, const void *value, size_t size, int flags) LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer, size_t buffer_size) LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid) LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new) LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, const char *name) LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir, struct kernfs_node *kn) LSM_HOOK(int, 0, file_permission, struct file *file, int mask) LSM_HOOK(int, 0, file_alloc_security, struct file *file) LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, unsigned long arg) LSM_HOOK(int, 0, mmap_addr, unsigned long addr) LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd) LSM_HOOK(int, 0, file_fcntl, struct file *file, unsigned int cmd, unsigned long arg) LSM_HOOK(void, LSM_RET_VOID, file_set_fowner, struct file *file) LSM_HOOK(int, 0, file_send_sigiotask, struct task_struct *tsk, struct fown_struct *fown, int sig) LSM_HOOK(int, 0, file_receive, struct file *file) LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, unsigned long clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) LSM_HOOK(int, 0, cred_prepare, struct cred *new, const struct cred *old, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_transfer, struct cred *new, const struct cred *old) LSM_HOOK(void, LSM_RET_VOID, cred_getsecid, const struct cred *c, u32 *secid) LSM_HOOK(int, 0, kernel_act_as, struct cred *new, u32 secid) LSM_HOOK(int, 0, kernel_create_files_as, struct cred *new, struct inode *inode) LSM_HOOK(int, 0, kernel_module_request, char *kmod_name) LSM_HOOK(int, 0, kernel_load_data, enum kernel_load_data_id id, bool contents) LSM_HOOK(int, 0, kernel_post_load_data, char *buf, loff_t size, enum kernel_load_data_id id, char *description) LSM_HOOK(int, 0, kernel_read_file, struct file *file, enum kernel_read_file_id id, bool contents) LSM_HOOK(int, 0, kernel_post_read_file, struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) LSM_HOOK(int, 0, task_fix_setuid, struct cred *new, const struct cred *old, int flags) LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old, int flags) LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid) LSM_HOOK(int, 0, task_getpgid, struct task_struct *p) LSM_HOOK(int, 0, task_getsid, struct task_struct *p) LSM_HOOK(void, LSM_RET_VOID, task_getsecid_subj, struct task_struct *p, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj, struct task_struct *p, u32 *secid) LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice) LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio) LSM_HOOK(int, 0, task_getioprio, struct task_struct *p) LSM_HOOK(int, 0, task_prlimit, const struct cred *cred, const struct cred *tcred, unsigned int flags) LSM_HOOK(int, 0, task_setrlimit, struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) LSM_HOOK(int, 0, task_setscheduler, struct task_struct *p) LSM_HOOK(int, 0, task_getscheduler, struct task_struct *p) LSM_HOOK(int, 0, task_movememory, struct task_struct *p) LSM_HOOK(int, 0, task_kill, struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p, struct inode *inode) LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag) LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp, u32 *secid) LSM_HOOK(int, 0, msg_msg_alloc_security, struct msg_msg *msg) LSM_HOOK(void, LSM_RET_VOID, msg_msg_free_security, struct msg_msg *msg) LSM_HOOK(int, 0, msg_queue_alloc_security, struct kern_ipc_perm *perm) LSM_HOOK(void, LSM_RET_VOID, msg_queue_free_security, struct kern_ipc_perm *perm) LSM_HOOK(int, 0, msg_queue_associate, struct kern_ipc_perm *perm, int msqflg) LSM_HOOK(int, 0, msg_queue_msgctl, struct kern_ipc_perm *perm, int cmd) LSM_HOOK(int, 0, msg_queue_msgsnd, struct kern_ipc_perm *perm, struct msg_msg *msg, int msqflg) LSM_HOOK(int, 0, msg_queue_msgrcv, struct kern_ipc_perm *perm, struct msg_msg *msg, struct task_struct *target, long type, int mode) LSM_HOOK(int, 0, shm_alloc_security, struct kern_ipc_perm *perm) LSM_HOOK(void, LSM_RET_VOID, shm_free_security, struct kern_ipc_perm *perm) LSM_HOOK(int, 0, shm_associate, struct kern_ipc_perm *perm, int shmflg) LSM_HOOK(int, 0, shm_shmctl, struct kern_ipc_perm *perm, int cmd) LSM_HOOK(int, 0, shm_shmat, struct kern_ipc_perm *perm, char __user *shmaddr, int shmflg) LSM_HOOK(int, 0, sem_alloc_security, struct kern_ipc_perm *perm) LSM_HOOK(void, LSM_RET_VOID, sem_free_security, struct kern_ipc_perm *perm) LSM_HOOK(int, 0, sem_associate, struct kern_ipc_perm *perm, int semflg) LSM_HOOK(int, 0, sem_semctl, struct kern_ipc_perm *perm, int cmd) LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, unsigned nsops, int alter) LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, u32 *seclen) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx, u32 *ctxlen) #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) #endif /* CONFIG_SECURITY && CONFIG_WATCH_QUEUE */ #if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS) LSM_HOOK(int, 0, watch_key, struct key *key) #endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other, struct sock *newsk) LSM_HOOK(int, 0, unix_may_send, struct socket *sock, struct socket *other) LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern) LSM_HOOK(int, 0, socket_post_create, struct socket *sock, int family, int type, int protocol, int kern) LSM_HOOK(int, 0, socket_socketpair, struct socket *socka, struct socket *sockb) LSM_HOOK(int, 0, socket_bind, struct socket *sock, struct sockaddr *address, int addrlen) LSM_HOOK(int, 0, socket_connect, struct socket *sock, struct sockaddr *address, int addrlen) LSM_HOOK(int, 0, socket_listen, struct socket *sock, int backlog) LSM_HOOK(int, 0, socket_accept, struct socket *sock, struct socket *newsock) LSM_HOOK(int, 0, socket_sendmsg, struct socket *sock, struct msghdr *msg, int size) LSM_HOOK(int, 0, socket_recvmsg, struct socket *sock, struct msghdr *msg, int size, int flags) LSM_HOOK(int, 0, socket_getsockname, struct socket *sock) LSM_HOOK(int, 0, socket_getpeername, struct socket *sock) LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how) LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb) LSM_HOOK(int, 0, socket_getpeersec_stream, struct socket *sock, char __user *optval, int __user *optlen, unsigned len) LSM_HOOK(int, 0, socket_getpeersec_dgram, struct socket *sock, struct sk_buff *skb, u32 *secid) LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority) LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk) LSM_HOOK(void, LSM_RET_VOID, sk_clone_security, const struct sock *sk, struct sock *newsk) LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, struct sock *sk, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, sock_graft, struct sock *sk, struct socket *parent) LSM_HOOK(int, 0, inet_conn_request, const struct sock *sk, struct sk_buff *skb, struct request_sock *req) LSM_HOOK(void, LSM_RET_VOID, inet_csk_clone, struct sock *newsk, const struct request_sock *req) LSM_HOOK(void, LSM_RET_VOID, inet_conn_established, struct sock *sk, struct sk_buff *skb) LSM_HOOK(int, 0, secmark_relabel_packet, u32 secid) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void) LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req, struct flowi_common *flic) LSM_HOOK(int, 0, tun_dev_alloc_security, void **security) LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security) LSM_HOOK(int, 0, tun_dev_create, void) LSM_HOOK(int, 0, tun_dev_attach_queue, void *security) LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security) LSM_HOOK(int, 0, tun_dev_open, void *security) LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_endpoint *ep, struct sk_buff *skb) LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_endpoint *ep, struct sock *sk, struct sock *newsk) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND LSM_HOOK(int, 0, ib_pkey_access, void *sec, u64 subnet_prefix, u16 pkey) LSM_HOOK(int, 0, ib_endport_manage_subnet, void *sec, const char *dev_name, u8 port_num) LSM_HOOK(int, 0, ib_alloc_security, void **sec) LSM_HOOK(void, LSM_RET_VOID, ib_free_security, void *sec) #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK(int, 0, xfrm_policy_alloc_security, struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) LSM_HOOK(int, 0, xfrm_policy_clone_security, struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx) LSM_HOOK(void, LSM_RET_VOID, xfrm_policy_free_security, struct xfrm_sec_ctx *ctx) LSM_HOOK(int, 0, xfrm_policy_delete_security, struct xfrm_sec_ctx *ctx) LSM_HOOK(int, 0, xfrm_state_alloc, struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) LSM_HOOK(int, 0, xfrm_state_alloc_acquire, struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) LSM_HOOK(void, LSM_RET_VOID, xfrm_state_free_security, struct xfrm_state *x) LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x) LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid) LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid, int ckall) #endif /* CONFIG_SECURITY_NETWORK_XFRM */ /* key management security hooks */ #ifdef CONFIG_KEYS LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred, unsigned long flags) LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key) LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **_buffer) #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr, void **lsmrule) LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule) LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule) LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) #ifdef CONFIG_PERF_EVENTS LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type) LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event) LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event) LSM_HOOK(int, 0, perf_event_read, struct perf_event *event) LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) #endif /* CONFIG_PERF_EVENTS */
8 8 8 24 24 24 20 15 7 6 7 1 6 8 8 8 8 8 8 7 8 15 15 15 15 13 15 15 14 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 4 14 22 3 1 15 1 1 24 1 20 2 17 1 1 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. * * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "ext4.h" #include <linux/fsmap.h> #include "fsmap.h" #include "mballoc.h" #include <linux/sort.h> #include <linux/list_sort.h> #include <trace/events/ext4.h> /* Convert an ext4_fsmap to an fsmap. */ void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, struct ext4_fsmap *src) { dest->fmr_device = src->fmr_device; dest->fmr_flags = src->fmr_flags; dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits; dest->fmr_owner = src->fmr_owner; dest->fmr_offset = 0; dest->fmr_length = src->fmr_length << sb->s_blocksize_bits; dest->fmr_reserved[0] = 0; dest->fmr_reserved[1] = 0; dest->fmr_reserved[2] = 0; } /* Convert an fsmap to an ext4_fsmap. */ void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, struct fsmap *src) { dest->fmr_device = src->fmr_device; dest->fmr_flags = src->fmr_flags; dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits; dest->fmr_owner = src->fmr_owner; dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits; } /* getfsmap query state */ struct ext4_getfsmap_info { struct ext4_fsmap_head *gfi_head; ext4_fsmap_format_t gfi_formatter; /* formatting fn */ void *gfi_format_arg;/* format buffer */ ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect */ u32 gfi_dev; /* device id */ ext4_group_t gfi_agno; /* bg number, if applicable */ struct ext4_fsmap gfi_low; /* low rmap key */ struct ext4_fsmap gfi_high; /* high rmap key */ struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg */ struct list_head gfi_meta_list; /* fixed metadata list */ bool gfi_last; /* last extent? */ }; /* Associate a device with a getfsmap handler. */ struct ext4_getfsmap_dev { int (*gfd_fn)(struct super_block *sb, struct ext4_fsmap *keys, struct ext4_getfsmap_info *info); u32 gfd_dev; }; /* Compare two getfsmap device handlers. */ static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) { const struct ext4_getfsmap_dev *d1 = p1; const struct ext4_getfsmap_dev *d2 = p2; return d1->gfd_dev - d2->gfd_dev; } /* Compare a record against our starting point */ static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, struct ext4_fsmap *rec) { return rec->fmr_physical < info->gfi_low.fmr_physical; } /* * Format a reverse mapping for getfsmap, having translated rm_startblock * into the appropriate daddr units. */ static int ext4_getfsmap_helper(struct super_block *sb, struct ext4_getfsmap_info *info, struct ext4_fsmap *rec) { struct ext4_fsmap fmr; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t rec_fsblk = rec->fmr_physical; ext4_group_t agno; ext4_grpblk_t cno; int error; if (fatal_signal_pending(current)) return -EINTR; /* * Filter out records that start before our startpoint, if the * caller requested that. */ if (ext4_getfsmap_rec_before_low_key(info, rec)) { rec_fsblk += rec->fmr_length; if (info->gfi_next_fsblk < rec_fsblk) info->gfi_next_fsblk = rec_fsblk; return EXT4_QUERY_RANGE_CONTINUE; } /* Are we just counting mappings? */ if (info->gfi_head->fmh_count == 0) { if (info->gfi_head->fmh_entries == UINT_MAX) return EXT4_QUERY_RANGE_ABORT; if (rec_fsblk > info->gfi_next_fsblk) info->gfi_head->fmh_entries++; if (info->gfi_last) return EXT4_QUERY_RANGE_CONTINUE; info->gfi_head->fmh_entries++; rec_fsblk += rec->fmr_length; if (info->gfi_next_fsblk < rec_fsblk) info->gfi_next_fsblk = rec_fsblk; return EXT4_QUERY_RANGE_CONTINUE; } /* * If the record starts past the last physical block we saw, * then we've found a gap. Report the gap as being owned by * whatever the caller specified is the missing owner. */ if (rec_fsblk > info->gfi_next_fsblk) { if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) return EXT4_QUERY_RANGE_ABORT; ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk, &agno, &cno); trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), rec_fsblk - info->gfi_next_fsblk, EXT4_FMR_OWN_UNKNOWN); fmr.fmr_device = info->gfi_dev; fmr.fmr_physical = info->gfi_next_fsblk; fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN; fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; error = info->gfi_formatter(&fmr, info->gfi_format_arg); if (error) return error; info->gfi_head->fmh_entries++; } if (info->gfi_last) goto out; /* Fill out the extent we found */ if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) return EXT4_QUERY_RANGE_ABORT; ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno); trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), rec->fmr_length, rec->fmr_owner); fmr.fmr_device = info->gfi_dev; fmr.fmr_physical = rec_fsblk; fmr.fmr_owner = rec->fmr_owner; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; fmr.fmr_length = rec->fmr_length; error = info->gfi_formatter(&fmr, info->gfi_format_arg); if (error) return error; info->gfi_head->fmh_entries++; out: rec_fsblk += rec->fmr_length; if (info->gfi_next_fsblk < rec_fsblk) info->gfi_next_fsblk = rec_fsblk; return EXT4_QUERY_RANGE_CONTINUE; } static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) { return fmr->fmr_physical + fmr->fmr_length; } /* Transform a blockgroup's free record into a fsmap */ static int ext4_getfsmap_datadev_helper(struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t len, void *priv) { struct ext4_fsmap irec; struct ext4_getfsmap_info *info = priv; struct ext4_fsmap *p; struct ext4_fsmap *tmp; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t fsb; ext4_fsblk_t fslen; int error; fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno)); fslen = EXT4_C2B(sbi, len); /* If the retained free extent record is set... */ if (info->gfi_lastfree.fmr_owner) { /* ...and abuts this one, lengthen it and return. */ if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) { info->gfi_lastfree.fmr_length += fslen; return 0; } /* * There's a gap between the two free extents; emit the * retained extent prior to merging the meta_list. */ error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); if (error) return error; info->gfi_lastfree.fmr_owner = 0; } /* Merge in any relevant extents from the meta_list */ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { list_del(&p->fmr_list); kfree(p); } else if (p->fmr_physical < fsb) { error = ext4_getfsmap_helper(sb, info, p); if (error) return error; list_del(&p->fmr_list); kfree(p); } } irec.fmr_device = 0; irec.fmr_physical = fsb; irec.fmr_length = fslen; irec.fmr_owner = EXT4_FMR_OWN_FREE; irec.fmr_flags = 0; /* If this is a free extent at the end of a bg, buffer it. */ if (ext4_fsmap_next_pblk(&irec) == ext4_group_first_block_no(sb, agno + 1)) { info->gfi_lastfree = irec; return 0; } /* Otherwise, emit it */ return ext4_getfsmap_helper(sb, info, &irec); } /* Execute a getfsmap query against the log device. */ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, struct ext4_getfsmap_info *info) { journal_t *journal = EXT4_SB(sb)->s_journal; struct ext4_fsmap irec; /* Set up search keys */ info->gfi_low = keys[0]; info->gfi_low.fmr_length = 0; memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0, info->gfi_low.fmr_physical, info->gfi_low.fmr_length, info->gfi_low.fmr_owner); trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0, info->gfi_high.fmr_physical, info->gfi_high.fmr_length, info->gfi_high.fmr_owner); if (keys[0].fmr_physical > 0) return 0; /* Fabricate an rmap entry for the external log device. */ irec.fmr_physical = journal->j_blk_offset; irec.fmr_length = journal->j_total_len; irec.fmr_owner = EXT4_FMR_OWN_LOG; irec.fmr_flags = 0; return ext4_getfsmap_helper(sb, info, &irec); } /* Helper to fill out an ext4_fsmap. */ static inline int ext4_getfsmap_fill(struct list_head *meta_list, ext4_fsblk_t fsb, ext4_fsblk_t len, uint64_t owner) { struct ext4_fsmap *fsm; fsm = kmalloc(sizeof(*fsm), GFP_NOFS); if (!fsm) return -ENOMEM; fsm->fmr_device = 0; fsm->fmr_flags = 0; fsm->fmr_physical = fsb; fsm->fmr_owner = owner; fsm->fmr_length = len; list_add_tail(&fsm->fmr_list, meta_list); return 0; } /* * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, ext4_group_t agno, struct list_head *meta_list) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno); ext4_fsblk_t len; unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb); int error; /* Record the superblock. */ if (ext4_bg_has_super(sb, agno)) { error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS); if (error) return error; fsb++; } /* Record the group descriptors. */ len = ext4_bg_num_gdb(sb, agno); if (!len) return 0; error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_GDT); if (error) return error; fsb += len; /* Reserved GDT blocks */ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_RESV_GDT); if (error) return error; } return 0; } /* Compare two fsmap items. */ static int ext4_getfsmap_compare(void *priv, const struct list_head *a, const struct list_head *b) { struct ext4_fsmap *fa; struct ext4_fsmap *fb; fa = container_of(a, struct ext4_fsmap, fmr_list); fb = container_of(b, struct ext4_fsmap, fmr_list); if (fa->fmr_physical < fb->fmr_physical) return -1; else if (fa->fmr_physical > fb->fmr_physical) return 1; return 0; } /* Merge adjacent extents of fixed metadata. */ static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list) { struct ext4_fsmap *p; struct ext4_fsmap *prev = NULL; struct ext4_fsmap *tmp; list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { if (!prev) { prev = p; continue; } if (prev->fmr_owner == p->fmr_owner && prev->fmr_physical + prev->fmr_length == p->fmr_physical) { prev->fmr_length += p->fmr_length; list_del(&p->fmr_list); kfree(p); } else prev = p; } } /* Free a list of fixed metadata. */ static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) { struct ext4_fsmap *p; struct ext4_fsmap *tmp; list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { list_del(&p->fmr_list); kfree(p); } } /* Find all the fixed metadata in the filesystem. */ static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, struct list_head *meta_list) { struct ext4_group_desc *gdp; ext4_group_t agno; int error; INIT_LIST_HEAD(meta_list); /* Collect everything. */ for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) { gdp = ext4_get_group_desc(sb, agno, NULL); if (!gdp) { error = -EFSCORRUPTED; goto err; } /* Superblock & GDT */ error = ext4_getfsmap_find_sb(sb, agno, meta_list); if (error) goto err; /* Block bitmap */ error = ext4_getfsmap_fill(meta_list, ext4_block_bitmap(sb, gdp), 1, EXT4_FMR_OWN_BLKBM); if (error) goto err; /* Inode bitmap */ error = ext4_getfsmap_fill(meta_list, ext4_inode_bitmap(sb, gdp), 1, EXT4_FMR_OWN_INOBM); if (error) goto err; /* Inodes */ error = ext4_getfsmap_fill(meta_list, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group, EXT4_FMR_OWN_INODES); if (error) goto err; } /* Sort the list */ list_sort(NULL, meta_list, ext4_getfsmap_compare); /* Merge adjacent extents */ ext4_getfsmap_merge_fixed_metadata(meta_list); return 0; err: ext4_getfsmap_free_fixed_metadata(meta_list); return error; } /* Execute a getfsmap query against the buddy bitmaps */ static int ext4_getfsmap_datadev(struct super_block *sb, struct ext4_fsmap *keys, struct ext4_getfsmap_info *info) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start_fsb; ext4_fsblk_t end_fsb; ext4_fsblk_t bofs; ext4_fsblk_t eofs; ext4_group_t start_ag; ext4_group_t end_ag; ext4_grpblk_t first_cluster; ext4_grpblk_t last_cluster; int error = 0; bofs = le32_to_cpu(sbi->s_es->s_first_data_block); eofs = ext4_blocks_count(sbi->s_es); if (keys[0].fmr_physical >= eofs) return 0; else if (keys[0].fmr_physical < bofs) keys[0].fmr_physical = bofs; if (keys[1].fmr_physical >= eofs) keys[1].fmr_physical = eofs - 1; if (keys[1].fmr_physical < keys[0].fmr_physical) return 0; start_fsb = keys[0].fmr_physical; end_fsb = keys[1].fmr_physical; /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster); ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster); /* * Convert the fsmap low/high keys to bg based keys. Initialize * low to the fsmap low key and max out the high key to the end * of the bg. */ info->gfi_low = keys[0]; info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster); info->gfi_low.fmr_length = 0; memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); /* Assemble a list of all the fixed-location metadata. */ error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list); if (error) goto err; /* Query each bg */ for (info->gfi_agno = start_ag; info->gfi_agno <= end_ag; info->gfi_agno++) { /* * Set the bg high key from the fsmap high key if this * is the last bg that we're querying. */ if (info->gfi_agno == end_ag) { info->gfi_high = keys[1]; info->gfi_high.fmr_physical = EXT4_C2B(sbi, last_cluster); info->gfi_high.fmr_length = 0; } trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno, info->gfi_low.fmr_physical, info->gfi_low.fmr_length, info->gfi_low.fmr_owner); trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno, info->gfi_high.fmr_physical, info->gfi_high.fmr_length, info->gfi_high.fmr_owner); error = ext4_mballoc_query_range(sb, info->gfi_agno, EXT4_B2C(sbi, info->gfi_low.fmr_physical), EXT4_B2C(sbi, info->gfi_high.fmr_physical), ext4_getfsmap_datadev_helper, info); if (error) goto err; /* * Set the bg low key to the start of the bg prior to * moving on to the next bg. */ if (info->gfi_agno == start_ag) memset(&info->gfi_low, 0, sizeof(info->gfi_low)); } /* Do we have a retained free extent? */ if (info->gfi_lastfree.fmr_owner) { error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); if (error) goto err; } /* Report any gaps at the end of the bg */ info->gfi_last = true; error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); if (error) goto err; err: ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list); return error; } /* Do we recognize the device? */ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, struct ext4_fsmap *fm) { if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; if (EXT4_SB(sb)->s_journal_bdev && fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev)) return true; return false; } /* Ensure that the low key is less than the high key. */ static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key, struct ext4_fsmap *high_key) { if (low_key->fmr_device > high_key->fmr_device) return false; if (low_key->fmr_device < high_key->fmr_device) return true; if (low_key->fmr_physical > high_key->fmr_physical) return false; if (low_key->fmr_physical < high_key->fmr_physical) return true; if (low_key->fmr_owner > high_key->fmr_owner) return false; if (low_key->fmr_owner < high_key->fmr_owner) return true; return false; } #define EXT4_GETFSMAP_DEVS 2 /* * Get filesystem's extents as described in head, and format for * output. Calls formatter to fill the user's buffer until all * extents are mapped, until the passed-in head->fmh_count slots have * been filled, or until the formatter short-circuits the loop, if it * is tracking filled-in extents on its own. * * Key to Confusion * ---------------- * There are multiple levels of keys and counters at work here: * _fsmap_head.fmh_keys -- low and high fsmap keys passed in; * these reflect fs-wide block addrs. * dkeys -- fmh_keys used to query each device; * these are fmh_keys but w/ the low key * bumped up by fmr_length. * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this * is how we detect gaps in the fsmap * records and report them. * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from * dkeys; used to query the free space. */ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, ext4_fsmap_format_t formatter, void *arg) { struct ext4_fsmap dkeys[2]; /* per-dev keys */ struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS]; struct ext4_getfsmap_info info = { NULL }; int i; int error = 0; if (head->fmh_iflags & ~FMH_IF_VALID) return -EINVAL; if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) || !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1])) return -EINVAL; head->fmh_entries = 0; /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; if (EXT4_SB(sb)->s_journal_bdev) { handlers[1].gfd_dev = new_encode_dev( EXT4_SB(sb)->s_journal_bdev->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev), ext4_getfsmap_dev_compare, NULL); /* * To continue where we left off, we allow userspace to use the * last mapping from a previous call as the low key of the next. * This is identified by a non-zero length in the low key. We * have to increment the low key in this scenario to ensure we * don't return the same mapping again, and instead return the * very next mapping. * * Bump the physical offset as there can be no other mapping for * the same physical block range. */ dkeys[0] = head->fmh_keys[0]; dkeys[0].fmr_physical += dkeys[0].fmr_length; dkeys[0].fmr_owner = 0; dkeys[0].fmr_length = 0; memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap)); if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) return -EINVAL; info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; info.gfi_formatter = formatter; info.gfi_format_arg = arg; info.gfi_head = head; /* For each device we support... */ for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) { /* Is this device within the range the user asked for? */ if (!handlers[i].gfd_fn) continue; if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev) continue; if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev) break; /* * If this device number matches the high key, we have * to pass the high key to the handler to limit the * query results. If the device number exceeds the * low key, zero out the low key so that we get * everything from the beginning. */ if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device) dkeys[1] = head->fmh_keys[1]; if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct ext4_fsmap)); info.gfi_dev = handlers[i].gfd_dev; info.gfi_last = false; info.gfi_agno = -1; error = handlers[i].gfd_fn(sb, dkeys, &info); if (error) break; info.gfi_next_fsblk = 0; } head->fmh_oflags = FMH_OF_DEV_T; return error; }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* AF_RXRPC internal definitions * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/atomic.h> #include <linux/seqlock.h> #include <linux/win_minmax.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include "protocol.h" #define FCRYPT_BSIZE 8 struct rxrpc_crypt { union { u8 x[FCRYPT_BSIZE]; __be32 n[2]; }; } __attribute__((aligned(8))); #define rxrpc_queue_work(WS) queue_work(rxrpc_workqueue, (WS)) #define rxrpc_queue_delayed_work(WS,D) \ queue_delayed_work(rxrpc_workqueue, (WS), (D)) struct key_preparsed_payload; struct rxrpc_connection; /* * Mark applied to socket buffers in skb->mark. skb->priority is used * to pass supplementary information. */ enum rxrpc_skb_mark { RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; /* * sk_state for RxRPC sockets */ enum { RXRPC_UNBOUND = 0, RXRPC_CLIENT_UNBOUND, /* Unbound socket used as client */ RXRPC_CLIENT_BOUND, /* client local address bound */ RXRPC_SERVER_BOUND, /* server local address bound */ RXRPC_SERVER_BOUND2, /* second server local address bound */ RXRPC_SERVER_LISTENING, /* server listening for connections */ RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */ RXRPC_CLOSE, /* socket is being closed */ }; /* * Per-network namespace data. */ struct rxrpc_net { struct proc_dir_entry *proc_net; /* Subdir in /proc/net */ u32 epoch; /* Local epoch for detecting local-end reset */ struct list_head calls; /* List of calls active in this namespace */ spinlock_t call_lock; /* Lock for ->calls */ atomic_t nr_calls; /* Count of allocated calls */ atomic_t nr_conns; struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ struct work_struct service_conn_reaper; struct timer_list service_conn_reap_timer; bool live; bool kill_all_client_conns; atomic_t nr_client_conns; spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */ struct list_head idle_client_conns; struct work_struct client_conn_reaper; struct timer_list client_conn_reap_timer; struct hlist_head local_endpoints; struct mutex local_mutex; /* Lock for ->local_endpoints */ DECLARE_HASHTABLE (peer_hash, 10); spinlock_t peer_hash_lock; /* Lock for ->peer_hash */ #define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */ u8 peer_keepalive_cursor; time64_t peer_keepalive_base; struct list_head peer_keepalive[32]; struct list_head peer_keepalive_new; struct timer_list peer_keepalive_timer; struct work_struct peer_keepalive_work; }; /* * Service backlog preallocation. * * This contains circular buffers of preallocated peers, connections and calls * for incoming service calls and their head and tail pointers. This allows * calls to be set up in the data_ready handler, thereby avoiding the need to * shuffle packets around so much. */ struct rxrpc_backlog { unsigned short peer_backlog_head; unsigned short peer_backlog_tail; unsigned short conn_backlog_head; unsigned short conn_backlog_tail; unsigned short call_backlog_head; unsigned short call_backlog_tail; #define RXRPC_BACKLOG_MAX 32 struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX]; struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX]; struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX]; }; /* * RxRPC socket definition */ struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */ rwlock_t recvmsg_lock; /* Lock for recvmsg_q */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT bool exclusive; /* Exclusive connection for a client socket */ u16 second_service; /* Additional service bound to the endpoint */ struct { /* Service upgrade information */ u16 from; /* Service ID to upgrade (if not 0) */ u16 to; /* service ID to upgrade to */ } service_upgrade; sa_family_t family; /* Protocol family created with */ struct sockaddr_rxrpc srx; /* Primary Service/local addresses */ struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */ }; #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) /* * CPU-byteorder normalised Rx packet header. */ struct rxrpc_host_header { u32 epoch; /* client boot timestamp */ u32 cid; /* connection and channel ID */ u32 callNumber; /* call ID (0 for connection-level packets) */ u32 seq; /* sequence number of pkt in call stream */ u32 serial; /* serial number of pkt sent to network */ u8 type; /* packet type */ u8 flags; /* packet flags */ u8 userStatus; /* app-layer defined status */ u8 securityIndex; /* security protocol ID */ union { u16 _rsvd; /* reserved */ u16 cksum; /* kerberos security checksum */ }; u16 serviceId; /* service ID */ } __packed; /* * RxRPC socket buffer private variables * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { atomic_t nr_ring_pins; /* Number of rxtx ring pins */ u8 nr_subpackets; /* Number of subpackets */ u8 rx_flags; /* Received packet flags */ #define RXRPC_SKB_INCL_LAST 0x01 /* - Includes last packet */ #define RXRPC_SKB_TX_BUFFER 0x02 /* - Is transmit buffer */ union { int remain; /* amount of space remaining for next write */ /* List of requested ACKs on subpackets */ unsigned long rx_req_ack[(RXRPC_MAX_NR_JUMBO + BITS_PER_LONG - 1) / BITS_PER_LONG]; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; #define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) /* * RxRPC security module interface */ struct rxrpc_security { const char *name; /* name of this service */ u8 security_index; /* security type provided */ u32 no_key_abort; /* Abort code indicating no key */ /* Initialise a security service */ int (*init)(void); /* Clean up a security service */ void (*exit)(void); /* Parse the information from a server key */ int (*preparse_server_key)(struct key_preparsed_payload *); /* Clean up the preparse buffer after parsing a server key */ void (*free_preparse_server_key)(struct key_preparsed_payload *); /* Destroy the payload of a server key */ void (*destroy_server_key)(struct key *); /* Describe a server key */ void (*describe_server_key)(const struct key *, struct seq_file *); /* initialise a connection's security */ int (*init_connection_security)(struct rxrpc_connection *, struct rxrpc_key_token *); /* Work out how much data we can store in a packet, given an estimate * of the amount of data remaining. */ int (*how_much_data)(struct rxrpc_call *, size_t, size_t *, size_t *, size_t *); /* impose security on a packet */ int (*secure_packet)(struct rxrpc_call *, struct sk_buff *, size_t); /* verify the security on a received packet */ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, unsigned int, unsigned int, rxrpc_seq_t, u16); /* Free crypto request on a call */ void (*free_call_crypto)(struct rxrpc_call *); /* Locate the data in a received packet that has been verified. */ void (*locate_data)(struct rxrpc_call *, struct sk_buff *, unsigned int *, unsigned int *); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); /* respond to a challenge */ int (*respond_to_challenge)(struct rxrpc_connection *, struct sk_buff *, u32 *); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, struct sk_buff *, u32 *); /* clear connection security */ void (*clear)(struct rxrpc_connection *); }; /* * RxRPC local transport endpoint description * - owned by a single AF_RXRPC socket * - pointed to by transport socket struct sk_user_data */ struct rxrpc_local { struct rcu_head rcu; atomic_t active_users; /* Number of users of the local endpoint */ refcount_t ref; /* Number of references to the structure */ struct rxrpc_net *rxnet; /* The network ns in which this resides */ struct hlist_node link; struct socket *socket; /* my UDP socket */ struct work_struct processor; struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ struct rb_root client_bundles; /* Client connection bundles by socket params */ spinlock_t client_bundles_lock; /* Lock for client_bundles */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ bool dead; bool service_closed; /* Service socket closed */ struct sockaddr_rxrpc srx; /* local address */ }; /* * RxRPC remote transport endpoint definition * - matched by local endpoint, remote port, address and protocol type */ struct rxrpc_peer { struct rcu_head rcu; /* This must be first */ refcount_t ref; unsigned long hash_key; struct hlist_node hash_link; struct rxrpc_local *local; struct hlist_head error_targets; /* targets for net error distribution */ struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ unsigned int if_mtu; /* interface MTU for this peer */ unsigned int mtu; /* network MTU for this peer */ unsigned int maxdata; /* data size (MTU - hdrsize) */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ struct sockaddr_rxrpc srx; /* remote address */ /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 spinlock_t rtt_input_lock; /* RTT lock for input routine */ ktime_t rtt_last_req; /* Time of last RTT request */ unsigned int rtt_count; /* Number of samples we've got */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rto_j; /* Retransmission timeout in jiffies */ u8 backoff; /* Backoff timeout */ u8 cong_cwnd; /* Congestion window size */ }; /* * Keys for matching a connection. */ struct rxrpc_conn_proto { union { struct { u32 epoch; /* epoch of this connection */ u32 cid; /* connection ID */ }; u64 index_key; }; }; struct rxrpc_conn_parameters { struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ u16 service_id; /* Service ID for this connection */ u32 security_level; /* Security level selected */ }; /* * Bits in the connection flags. */ enum rxrpc_conn_flag { RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */ RXRPC_CONN_FINAL_ACK_0, /* Need final ACK for channel 0 */ RXRPC_CONN_FINAL_ACK_1, /* Need final ACK for channel 1 */ RXRPC_CONN_FINAL_ACK_2, /* Need final ACK for channel 2 */ RXRPC_CONN_FINAL_ACK_3, /* Need final ACK for channel 3 */ }; #define RXRPC_CONN_FINAL_ACK_MASK ((1UL << RXRPC_CONN_FINAL_ACK_0) | \ (1UL << RXRPC_CONN_FINAL_ACK_1) | \ (1UL << RXRPC_CONN_FINAL_ACK_2) | \ (1UL << RXRPC_CONN_FINAL_ACK_3)) /* * Events that can be raised upon a connection. */ enum rxrpc_conn_event { RXRPC_CONN_EV_CHALLENGE, /* Send challenge packet */ }; /* * The connection protocol state. */ enum rxrpc_conn_proto_state { RXRPC_CONN_UNUSED, /* Connection not yet attempted */ RXRPC_CONN_CLIENT, /* Client connection */ RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ RXRPC_CONN_SERVICE, /* Service secured connection */ RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */ RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */ RXRPC_CONN__NR_STATES }; /* * RxRPC client connection bundle. */ struct rxrpc_bundle { struct rxrpc_conn_parameters params; refcount_t ref; atomic_t active; /* Number of active users */ unsigned int debug_id; bool try_upgrade; /* True if the bundle is attempting upgrade */ bool alloc_conn; /* True if someone's getting a conn */ short alloc_error; /* Error from last conn allocation */ spinlock_t channel_lock; struct rb_node local_node; /* Node in local->client_conns */ struct list_head waiting_calls; /* Calls waiting for channels */ unsigned long avail_chans; /* Mask of available channels */ struct rxrpc_connection *conns[4]; /* The connections in the bundle (max 4) */ }; /* * RxRPC connection definition * - matched by { local, peer, epoch, conn_id, direction } * - each connection can only handle four simultaneous calls */ struct rxrpc_connection { struct rxrpc_conn_proto proto; struct rxrpc_conn_parameters params; refcount_t ref; struct rcu_head rcu; struct list_head cache_link; unsigned char act_chans; /* Mask of active channels */ struct rxrpc_channel { unsigned long final_ack_at; /* Time at which to issue final ACK */ struct rxrpc_call __rcu *call; /* Active call */ unsigned int call_debug_id; /* call->debug_id */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ u32 last_call; /* ID of last call */ u8 last_type; /* Type of last packet */ union { u32 last_seq; u32 last_abort; }; } channels[RXRPC_MAXCALLS]; struct timer_list timer; /* Conn event timer */ struct work_struct processor; /* connection event processor */ struct rxrpc_bundle *bundle; /* Client connection bundle */ struct rb_node service_node; /* Node in peer->service_conns */ struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ const struct rxrpc_security *security; /* applied security module */ union { struct { struct crypto_sync_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ u32 nonce; /* response re-use preventer */ } rxkad; }; unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ spinlock_t state_lock; /* state-change lock */ enum rxrpc_conn_proto_state state; /* current state of connection */ u32 abort_code; /* Abort code of connection abort */ int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ u32 service_id; /* Service ID, possibly upgraded */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ u8 bundle_shift; /* Index into bundle->avail_chans */ short error; /* Local error code */ }; static inline bool rxrpc_to_server(const struct rxrpc_skb_priv *sp) { return sp->hdr.flags & RXRPC_CLIENT_INITIATED; } static inline bool rxrpc_to_client(const struct rxrpc_skb_priv *sp) { return !rxrpc_to_server(sp); } /* * Flags in call->flags. */ enum rxrpc_call_flag { RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ RXRPC_CALL_HAS_USERID, /* has a user ID attached */ RXRPC_CALL_IS_SERVICE, /* Call is service call */ RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ RXRPC_CALL_KERNEL, /* The call was made by the kernel */ RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */ }; /* * Events that can be raised on a call. */ enum rxrpc_call_event { RXRPC_CALL_EV_ACK, /* need to generate ACK */ RXRPC_CALL_EV_ABORT, /* need to generate abort */ RXRPC_CALL_EV_RESEND, /* Tx resend required */ RXRPC_CALL_EV_PING, /* Ping send required */ RXRPC_CALL_EV_EXPIRED, /* Expiry occurred */ RXRPC_CALL_EV_ACK_LOST, /* ACK may be lost, send ping */ }; /* * The states that a call can be in. */ enum rxrpc_call_state { RXRPC_CALL_UNINITIALISED, RXRPC_CALL_CLIENT_AWAIT_CONN, /* - client waiting for connection to become available */ RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ RXRPC_CALL_COMPLETE, /* - call complete */ NR__RXRPC_CALL_STATES }; /* * Call completion condition (state == RXRPC_CALL_COMPLETE). */ enum rxrpc_call_completion { RXRPC_CALL_SUCCEEDED, /* - Normal termination */ RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ NR__RXRPC_CALL_COMPLETIONS }; /* * Call Tx congestion management modes. */ enum rxrpc_congest_mode { RXRPC_CALL_SLOW_START, RXRPC_CALL_CONGEST_AVOIDANCE, RXRPC_CALL_PACKET_LOSS, RXRPC_CALL_FAST_RETRANSMIT, NR__RXRPC_CONGEST_MODES }; /* * RxRPC call definition * - matched by { connection, call_id } */ struct rxrpc_call { struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ unsigned long ack_at; /* When deferred ACK needs to happen */ unsigned long ack_lost_at; /* When ACK is figured as lost */ unsigned long resend_at; /* When next resend needs to happen */ unsigned long ping_at; /* When next to send a ping */ unsigned long keepalive_at; /* When next to send a keepalive ping */ unsigned long expect_rx_by; /* When we expect to get a packet by */ unsigned long expect_req_by; /* When we expect to get a request DATA packet by */ unsigned long expect_term_by; /* When we expect call termination by */ u32 next_rx_timo; /* Timeout for next Rx packet (jif) */ u32 next_req_timo; /* Timeout for next Rx request packet (jif) */ struct skcipher_request *cipher_req; /* Packet cipher request buffer */ struct timer_list timer; /* Combined event timer */ struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ struct list_head chan_wait_link; /* Link in conn->bundle->waiting_calls */ struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* Link in rx->acceptq */ struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ struct list_head sock_link; /* Link in rx->sock_calls */ struct rb_node sock_node; /* Node in rx->calls */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ s64 tx_total_len; /* Total length left to be transmitted (or -1) */ __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ unsigned long user_call_ID; /* user-defined call ID */ unsigned long flags; unsigned long events; spinlock_t lock; spinlock_t notify_lock; /* Kernel notification lock */ rwlock_t state_lock; /* lock for state transition */ u32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ enum rxrpc_call_state state; /* current state of call */ enum rxrpc_call_completion completion; /* Call completion condition */ refcount_t ref; u16 service_id; /* service ID */ u8 security_ix; /* Security type */ enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ bool rx_pkt_last; /* Current recvmsg packet is last */ /* Rx/Tx circular buffer, depending on phase. * * In the Rx phase, packets are annotated with 0 or the number of the * segment of a jumbo packet each buffer refers to. There can be up to * 47 segments in a maximum-size UDP packet. * * In the Tx phase, packets are annotated with which buffers have been * acked. */ #define RXRPC_RXTX_BUFF_SIZE 64 #define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1) #define RXRPC_INIT_RX_WINDOW_SIZE 63 struct sk_buff **rxtx_buffer; u8 *rxtx_annotations; #define RXRPC_TX_ANNO_ACK 0 #define RXRPC_TX_ANNO_UNACK 1 #define RXRPC_TX_ANNO_NAK 2 #define RXRPC_TX_ANNO_RETRANS 3 #define RXRPC_TX_ANNO_MASK 0x03 #define RXRPC_TX_ANNO_LAST 0x04 #define RXRPC_TX_ANNO_RESENT 0x08 #define RXRPC_RX_ANNO_SUBPACKET 0x3f /* Subpacket number in jumbogram */ #define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but * not hard-ACK'd packet follows this. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ u16 tx_backoff; /* Delay to insert due to Tx failure */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA * packets) rather than bytes. */ #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN u8 cong_cwnd; /* Congestion window size */ u8 cong_extra; /* Extra to send for congestion management */ u8 cong_ssthresh; /* Slow-start threshold */ enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ u8 cong_dup_acks; /* Count of ACKs showing missing packets */ u8 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not * consumed packet follows this. */ rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */ rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ rxrpc_serial_t rx_serial; /* Highest serial received for this call */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ bool tx_phase; /* T if transmission phase, F if receive phase */ u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ spinlock_t input_lock; /* Lock for packet input to this call */ /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ rxrpc_seq_t ackr_highest_seq; /* Higest sequence number received */ atomic_t ackr_nr_unacked; /* Number of unacked packets */ atomic_t ackr_nr_consumed; /* Number of packets needing hard ACK */ /* RTT management */ rxrpc_serial_t rtt_serial[4]; /* Serial number of DATA or PING sent */ ktime_t rtt_sent_at[4]; /* Time packet sent */ unsigned long rtt_avail; /* Mask of available slots in bits 0-3, * Mask of pending samples in 8-11 */ #define RXRPC_CALL_RTT_AVAIL_MASK 0xf #define RXRPC_CALL_RTT_PEND_SHIFT 8 /* Transmission-phase ACK management (ACKs we've received). */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_seq_t acks_first_seq; /* first sequence number received */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ }; /* * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { u8 ack_reason; u8 nr_acks; /* Number of ACKs in packet */ u8 nr_nacks; /* Number of NACKs in packet */ u8 nr_new_acks; /* Number of new ACKs in packet */ u8 nr_new_nacks; /* Number of new NACKs in packet */ u8 nr_rot_new_acks; /* Number of rotated new ACKs */ bool new_low_nack; /* T if new low NACK found */ bool retrans_timeo; /* T if reTx due to timeout happened */ u8 flight_size; /* Number of unreceived transmissions */ /* Place to stash values for tracing */ enum rxrpc_congest_mode mode:8; u8 cwnd; u8 ssthresh; u8 dup_acks; u8 cumulative_acks; }; /* * sendmsg() cmsg-specified parameters. */ enum rxrpc_command { RXRPC_CMD_SEND_DATA, /* send data message */ RXRPC_CMD_SEND_ABORT, /* request abort generation */ RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ RXRPC_CMD_CHARGE_ACCEPT, /* [server] charge accept preallocation */ }; struct rxrpc_call_params { s64 tx_total_len; /* Total Tx data length (if send data) */ unsigned long user_call_ID; /* User's call ID */ struct { u32 hard; /* Maximum lifetime (sec) */ u32 idle; /* Max time since last data packet (msec) */ u32 normal; /* Max time since last call packet (msec) */ } timeouts; u8 nr_timeouts; /* Number of timeouts specified */ bool kernel; /* T if kernel is making the call */ enum rxrpc_interruptibility interruptibility; /* How is interruptible is the call? */ }; struct rxrpc_send_params { struct rxrpc_call_params call; u32 abort_code; /* Abort code to Tx (if abort) */ enum rxrpc_command command : 8; /* The command to implement */ bool exclusive; /* Shared or exclusive call */ bool upgrade; /* If the connection is upgradeable */ }; #include <trace/events/rxrpc.h> /* * af_rxrpc.c */ extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; extern struct workqueue_struct *rxrpc_workqueue; /* * call_accept.c */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_sock *, struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); /* * call_event.c */ void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool, bool, enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); void rxrpc_reduce_call_timer(struct rxrpc_call *call, unsigned long expire_at, unsigned long now, enum rxrpc_timer_trace why); void rxrpc_delete_call_timer(struct rxrpc_call *call); /* * call_object.c */ extern const char *const rxrpc_call_states[]; extern const char *const rxrpc_call_completions[]; extern unsigned int rxrpc_max_call_lifetime; extern struct kmem_cache *rxrpc_call_jar; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, struct rxrpc_call_params *, gfp_t, unsigned int); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); bool __rxrpc_queue_call(struct rxrpc_call *); bool rxrpc_queue_call(struct rxrpc_call *); void rxrpc_see_call(struct rxrpc_call *); bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_cleanup_call(struct rxrpc_call *); void rxrpc_destroy_all_calls(struct rxrpc_net *); static inline bool rxrpc_is_service_call(const struct rxrpc_call *call) { return test_bit(RXRPC_CALL_IS_SERVICE, &call->flags); } static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) { return !rxrpc_is_service_call(call); } /* * conn_client.c */ extern unsigned int rxrpc_reap_client_connections; extern unsigned long rxrpc_conn_idle_client_expiry; extern unsigned long rxrpc_conn_idle_client_fast_expiry; extern struct idr rxrpc_client_conn_ids; void rxrpc_destroy_client_conn_ids(void); struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *); void rxrpc_put_bundle(struct rxrpc_bundle *); int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, gfp_t); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); void rxrpc_put_client_conn(struct rxrpc_connection *); void rxrpc_discard_expired_client_conns(struct work_struct *); void rxrpc_destroy_all_client_connections(struct rxrpc_net *); void rxrpc_clean_up_local_conns(struct rxrpc_local *); /* * conn_event.c */ void rxrpc_process_connection(struct work_struct *); void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); /* * conn_object.c */ extern unsigned int rxrpc_connection_expiry; extern unsigned int rxrpc_closed_conn_expiry; struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, struct sk_buff *, struct rxrpc_peer **); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); bool rxrpc_queue_conn(struct rxrpc_connection *); void rxrpc_see_connection(struct rxrpc_connection *); struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *); struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *); void rxrpc_put_service_conn(struct rxrpc_connection *); void rxrpc_service_connection_reaper(struct work_struct *); void rxrpc_destroy_all_connections(struct rxrpc_net *); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) { return conn->out_clientflag; } static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) { return !rxrpc_conn_is_client(conn); } static inline void rxrpc_put_connection(struct rxrpc_connection *conn) { if (!conn) return; if (rxrpc_conn_is_client(conn)) rxrpc_put_client_conn(conn); else rxrpc_put_service_conn(conn); } static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, unsigned long expire_at) { timer_reduce(&conn->timer, expire_at); } /* * conn_service.c */ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); void rxrpc_new_incoming_connection(struct rxrpc_sock *, struct rxrpc_connection *, const struct rxrpc_security *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ int rxrpc_input_packet(struct sock *, struct sk_buff *); /* * insecure.c */ extern const struct rxrpc_security rxrpc_no_security; /* * key.c */ extern struct key_type key_type_rxrpc; int rxrpc_request_key(struct rxrpc_sock *, sockptr_t , int); int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, u32); /* * local_event.c */ extern void rxrpc_process_local_events(struct rxrpc_local *); /* * local_object.c */ struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *); struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *); struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *); void rxrpc_put_local(struct rxrpc_local *); struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *); void rxrpc_unuse_local(struct rxrpc_local *); void rxrpc_queue_local(struct rxrpc_local *); void rxrpc_destroy_all_locals(struct rxrpc_net *); static inline bool __rxrpc_unuse_local(struct rxrpc_local *local) { return atomic_dec_return(&local->active_users) == 0; } static inline bool __rxrpc_use_local(struct rxrpc_local *local) { return atomic_fetch_add_unless(&local->active_users, 1, 0) != 0; } /* * misc.c */ extern unsigned int rxrpc_max_backlog __read_mostly; extern unsigned long rxrpc_requested_ack_delay; extern unsigned long rxrpc_soft_ack_delay; extern unsigned long rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; extern const s8 rxrpc_ack_priority[]; /* * net_ns.c */ extern unsigned int rxrpc_net_id; extern struct pernet_operations rxrpc_net_ops; static inline struct rxrpc_net *rxrpc_net(struct net *net) { return net_generic(net, rxrpc_net_id); } /* * output.c */ int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); void rxrpc_reject_packets(struct rxrpc_local *); void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); void rxrpc_error_report(struct sock *); void rxrpc_peer_keepalive_worker(struct work_struct *); /* * peer_object.c */ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, const struct sockaddr_rxrpc *); struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *, struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); void rxrpc_new_incoming_peer(struct rxrpc_sock *, struct rxrpc_local *, struct rxrpc_peer *); void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); void rxrpc_put_peer(struct rxrpc_peer *); void rxrpc_put_peer_locked(struct rxrpc_peer *); /* * proc.c */ extern const struct seq_operations rxrpc_call_seq_ops; extern const struct seq_operations rxrpc_connection_seq_ops; extern const struct seq_operations rxrpc_peer_seq_ops; extern const struct seq_operations rxrpc_local_seq_ops; /* * recvmsg.c */ void rxrpc_notify_socket(struct rxrpc_call *); bool __rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); bool rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); bool __rxrpc_call_completed(struct rxrpc_call *); bool rxrpc_call_completed(struct rxrpc_call *); bool __rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); bool rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* * Abort a call due to a protocol error. */ static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, struct sk_buff *skb, const char *eproto_why, const char *why, u32 abort_code) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); trace_rxrpc_rx_eproto(call, sp->hdr.serial, eproto_why); return rxrpc_abort_call(why, call, sp->hdr.seq, abort_code, -EPROTO); } #define rxrpc_abort_eproto(call, skb, eproto_why, abort_why, abort_code) \ __rxrpc_abort_eproto((call), (skb), tracepoint_string(eproto_why), \ (abort_why), (abort_code)) /* * rtt.c */ void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool); void rxrpc_peer_init_rtt(struct rxrpc_peer *); /* * rxkad.c */ #ifdef CONFIG_RXKAD extern const struct rxrpc_security rxkad; #endif /* * security.c */ int __init rxrpc_init_security(void); const struct rxrpc_security *rxrpc_security_lookup(u8); void rxrpc_exit_security(void); int rxrpc_init_client_conn_security(struct rxrpc_connection *); const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *, struct sk_buff *); struct key *rxrpc_look_up_server_security(struct rxrpc_connection *, struct sk_buff *, u32, u32); /* * sendmsg.c */ int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* * server_key.c */ extern struct key_type key_type_rxrpc_s; int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); /* * skbuff.c */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_packet_destructor(struct sk_buff *); void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_eaten_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); /* * sysctl.c */ #ifdef CONFIG_SYSCTL extern int __init rxrpc_sysctl_init(void); extern void rxrpc_sysctl_exit(void); #else static inline int __init rxrpc_sysctl_init(void) { return 0; } static inline void rxrpc_sysctl_exit(void) {} #endif /* * utils.c */ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); static inline bool before(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) < 0; } static inline bool before_eq(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) <= 0; } static inline bool after(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) > 0; } static inline bool after_eq(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) >= 0; } /* * debug tracing */ extern unsigned int rxrpc_debug; #define dbgprintk(FMT,...) \ printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) #define kproto(FMT,...) dbgprintk("### "FMT ,##__VA_ARGS__) #define knet(FMT,...) dbgprintk("@@@ "FMT ,##__VA_ARGS__) #if defined(__KDEBUG) #define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) #define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) #define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) #define _proto(FMT,...) kproto(FMT,##__VA_ARGS__) #define _net(FMT,...) knet(FMT,##__VA_ARGS__) #elif defined(CONFIG_AF_RXRPC_DEBUG) #define RXRPC_DEBUG_KENTER 0x01 #define RXRPC_DEBUG_KLEAVE 0x02 #define RXRPC_DEBUG_KDEBUG 0x04 #define RXRPC_DEBUG_KPROTO 0x08 #define RXRPC_DEBUG_KNET 0x10 #define _enter(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KENTER)) \ kenter(FMT,##__VA_ARGS__); \ } while (0) #define _leave(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KLEAVE)) \ kleave(FMT,##__VA_ARGS__); \ } while (0) #define _debug(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KDEBUG)) \ kdebug(FMT,##__VA_ARGS__); \ } while (0) #define _proto(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KPROTO)) \ kproto(FMT,##__VA_ARGS__); \ } while (0) #define _net(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KNET)) \ knet(FMT,##__VA_ARGS__); \ } while (0) #else #define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) #define _proto(FMT,...) no_printk("### "FMT ,##__VA_ARGS__) #define _net(FMT,...) no_printk("@@@ "FMT ,##__VA_ARGS__) #endif /* * debug assertion checking */ #if 1 // defined(__KDEBUGALL) #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTCMP(X, OP, Y) \ do { \ __typeof__(X) _x = (X); \ __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely(!(_x OP _y))) { \ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ (unsigned long)_x, (unsigned long)_x, #OP, \ (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ __typeof__(X) _x = (X); \ __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely((C) && !(_x OP _y))) { \ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ (unsigned long)_x, (unsigned long)_x, #OP, \ (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) #else #define ASSERT(X) \ do { \ } while (0) #define ASSERTCMP(X, OP, Y) \ do { \ } while (0) #define ASSERTIF(C, X) \ do { \ } while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ } while (0) #endif /* __KDEBUGALL */
2628 2627 339 339 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 // SPDX-License-Identifier: GPL-2.0-only /* * Lock-less NULL terminated single linked list * * The basic atomic operation of this list is cmpxchg on long. On * architectures that don't have NMI-safe cmpxchg implementation, the * list can NOT be used in NMI handlers. So code that uses the list in * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2010,2011 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/llist.h> /** * llist_add_batch - add several linked entries in batch * @new_first: first entry in batch to be added * @new_last: last entry in batch to be added * @head: the head for your lock-less list * * Return whether list is empty before adding. */ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head) { struct llist_node *first; do { new_last->next = first = READ_ONCE(head->first); } while (cmpxchg(&head->first, first, new_first) != first); return !first; } EXPORT_SYMBOL_GPL(llist_add_batch); /** * llist_del_first - delete the first entry of lock-less list * @head: the head for your lock-less list * * If list is empty, return NULL, otherwise, return the first entry * deleted, this is the newest added one. * * Only one llist_del_first user can be used simultaneously with * multiple llist_add users without lock. Because otherwise * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, * llist_add) sequence in another user may change @head->first->next, * but keep @head->first. If multiple consumers are needed, please * use llist_del_all or use lock between consumers. */ struct llist_node *llist_del_first(struct llist_head *head) { struct llist_node *entry, *old_entry, *next; entry = smp_load_acquire(&head->first); for (;;) { if (entry == NULL) return NULL; old_entry = entry; next = READ_ONCE(entry->next); entry = cmpxchg(&head->first, old_entry, next); if (entry == old_entry) break; } return entry; } EXPORT_SYMBOL_GPL(llist_del_first); /** * llist_reverse_order - reverse order of a llist chain * @head: first item of the list to be reversed * * Reverse the order of a chain of llist entries and return the * new first entry. */ struct llist_node *llist_reverse_order(struct llist_node *head) { struct llist_node *new_head = NULL; while (head) { struct llist_node *tmp = head; head = head->next; tmp->next = new_head; new_head = tmp; } return new_head; } EXPORT_SYMBOL_GPL(llist_reverse_order);
24 11 42 41 42 42 33 33 33 9 21 21 17 41 25 21 21 17 21 12 12 9 9 9 9 9 9 34 34 30 12 30 25 25 12 25 1 1 33 33 33 33 24 64 44 64 64 64 64 64 64 62 62 62 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 // SPDX-License-Identifier: GPL-2.0-or-later /* * Forwarding database * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/times.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/atomic.h> #include <asm/unaligned.h> #include <linux/if_vlan.h> #include <net/switchdev.h> #include <trace/events/bridge.h> #include "br_private.h" static const struct rhashtable_params br_fdb_rht_params = { .head_offset = offsetof(struct net_bridge_fdb_entry, rhnode), .key_offset = offsetof(struct net_bridge_fdb_entry, key), .key_len = sizeof(struct net_bridge_fdb_key), .automatic_shrinking = true, }; static struct kmem_cache *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *, int, bool); int __init br_fdb_init(void) { br_fdb_cache = kmem_cache_create("bridge_fdb_cache", sizeof(struct net_bridge_fdb_entry), 0, SLAB_HWCACHE_ALIGN, NULL); if (!br_fdb_cache) return -ENOMEM; return 0; } void br_fdb_fini(void) { kmem_cache_destroy(br_fdb_cache); } int br_fdb_hash_init(struct net_bridge *br) { return rhashtable_init(&br->fdb_hash_tbl, &br_fdb_rht_params); } void br_fdb_hash_fini(struct net_bridge *br) { rhashtable_destroy(&br->fdb_hash_tbl); } /* if topology_changing then use forward_delay (default 15 sec) * otherwise keep longer (default 5 minutes) */ static inline unsigned long hold_time(const struct net_bridge *br) { return br->topology_change ? br->forward_delay : br->ageing_time; } static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { return !test_bit(BR_FDB_STATIC, &fdb->flags) && !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) && time_before_eq(fdb->updated + hold_time(br), jiffies); } static void fdb_rcu_free(struct rcu_head *head) { struct net_bridge_fdb_entry *ent = container_of(head, struct net_bridge_fdb_entry, rcu); kmem_cache_free(br_fdb_cache, ent); } static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl, const unsigned char *addr, __u16 vid) { struct net_bridge_fdb_key key; WARN_ON_ONCE(!rcu_read_lock_held()); key.vlan_id = vid; memcpy(key.addr.addr, addr, sizeof(key.addr.addr)); return rhashtable_lookup(tbl, &key, br_fdb_rht_params); } /* requires bridge hash_lock */ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br, const unsigned char *addr, __u16 vid) { struct net_bridge_fdb_entry *fdb; lockdep_assert_held_once(&br->hash_lock); rcu_read_lock(); fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); rcu_read_unlock(); return fdb; } struct net_device *br_fdb_find_port(const struct net_device *br_dev, const unsigned char *addr, __u16 vid) { struct net_bridge_fdb_entry *f; struct net_device *dev = NULL; struct net_bridge *br; ASSERT_RTNL(); if (!netif_is_bridge_master(br_dev)) return NULL; br = netdev_priv(br_dev); rcu_read_lock(); f = br_fdb_find_rcu(br, addr, vid); if (f && f->dst) dev = f->dst->dev; rcu_read_unlock(); return dev; } EXPORT_SYMBOL_GPL(br_fdb_find_port); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, const unsigned char *addr, __u16 vid) { return fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); } /* When a static FDB entry is added, the mac address from the entry is * added to the bridge private HW address list and all required ports * are then updated with the new information. * Called under RTNL. */ static void fdb_add_hw_addr(struct net_bridge *br, const unsigned char *addr) { int err; struct net_bridge_port *p; ASSERT_RTNL(); list_for_each_entry(p, &br->port_list, list) { if (!br_promisc_port(p)) { err = dev_uc_add(p->dev, addr); if (err) goto undo; } } return; undo: list_for_each_entry_continue_reverse(p, &br->port_list, list) { if (!br_promisc_port(p)) dev_uc_del(p->dev, addr); } } /* When a static FDB entry is deleted, the HW address from that entry is * also removed from the bridge private HW address list and updates all * the ports with needed information. * Called under RTNL. */ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) { struct net_bridge_port *p; ASSERT_RTNL(); list_for_each_entry(p, &br->port_list, list) { if (!br_promisc_port(p)) dev_uc_del(p->dev, addr); } } static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f, bool swdev_notify) { trace_fdb_delete(br, f); if (test_bit(BR_FDB_STATIC, &f->flags)) fdb_del_hw_addr(br, f->key.addr.addr); hlist_del_init_rcu(&f->fdb_node); rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode, br_fdb_rht_params); fdb_notify(br, f, RTM_DELNEIGH, swdev_notify); call_rcu(&f->rcu, fdb_rcu_free); } /* Delete a local entry if no other port had the same address. */ static void fdb_delete_local(struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_fdb_entry *f) { const unsigned char *addr = f->key.addr.addr; struct net_bridge_vlan_group *vg; const struct net_bridge_vlan *v; struct net_bridge_port *op; u16 vid = f->key.vlan_id; /* Maybe another port has same hw addr? */ list_for_each_entry(op, &br->port_list, list) { vg = nbp_vlan_group(op); if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && (!vid || br_vlan_find(vg, vid))) { f->dst = op; clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } } vg = br_vlan_group(br); v = br_vlan_find(vg, vid); /* Maybe bridge device has same hw addr? */ if (p && ether_addr_equal(br->dev->dev_addr, addr) && (!vid || (v && br_vlan_should_use(v)))) { f->dst = NULL; clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } fdb_delete(br, f, true); } void br_fdb_find_delete_local(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid) { struct net_bridge_fdb_entry *f; spin_lock_bh(&br->hash_lock); f = br_fdb_find(br, addr, vid); if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p) fdb_delete_local(br, p, f); spin_unlock_bh(&br->hash_lock); } void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; struct net_bridge *br = p->br; struct net_bridge_vlan *v; spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); hlist_for_each_entry(f, &br->fdb_list, fdb_node) { if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) { /* delete old one */ fdb_delete_local(br, p, f); /* if this port has no vlan information * configured, we can safely be done at * this point. */ if (!vg || !vg->num_vlans) goto insert; } } insert: /* insert new address, may fail if invalid address or dup. */ fdb_insert(br, p, newaddr, 0); if (!vg || !vg->num_vlans) goto done; /* Now add entries for every VLAN configured on the port. * This function runs under RTNL so the bitmap will not change * from under us. */ list_for_each_entry(v, &vg->vlan_list, vlist) fdb_insert(br, p, newaddr, v->vid); done: spin_unlock_bh(&br->hash_lock); } void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) { struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; struct net_bridge_vlan *v; spin_lock_bh(&br->hash_lock); /* If old entry was unassociated with any port, then delete it. */ f = br_fdb_find(br, br->dev->dev_addr, 0); if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); vg = br_vlan_group(br); if (!vg || !vg->num_vlans) goto out; /* Now remove and add entries for every VLAN configured on the * bridge. This function runs under RTNL so the bitmap will not * change from under us. */ list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; f = br_fdb_find(br, br->dev->dev_addr, v->vid); if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, v->vid); } out: spin_unlock_bh(&br->hash_lock); } void br_fdb_cleanup(struct work_struct *work) { struct net_bridge *br = container_of(work, struct net_bridge, gc_work.work); struct net_bridge_fdb_entry *f = NULL; unsigned long delay = hold_time(br); unsigned long work_delay = delay; unsigned long now = jiffies; /* this part is tricky, in order to avoid blocking learning and * consequently forwarding, we rely on rcu to delete objects with * delayed freeing allowing us to continue traversing */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { unsigned long this_timer = f->updated + delay; if (test_bit(BR_FDB_STATIC, &f->flags) || test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) { if (test_bit(BR_FDB_NOTIFY, &f->flags)) { if (time_after(this_timer, now)) work_delay = min(work_delay, this_timer - now); else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &f->flags)) fdb_notify(br, f, RTM_NEWNEIGH, false); } continue; } if (time_after(this_timer, now)) { work_delay = min(work_delay, this_timer - now); } else { spin_lock_bh(&br->hash_lock); if (!hlist_unhashed(&f->fdb_node)) fdb_delete(br, f, true); spin_unlock_bh(&br->hash_lock); } } rcu_read_unlock(); /* Cleanup minimum 10 milliseconds apart */ work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10)); mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } /* Completely flush all dynamic entries in forwarding database.*/ void br_fdb_flush(struct net_bridge *br) { struct net_bridge_fdb_entry *f; struct hlist_node *tmp; spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { if (!test_bit(BR_FDB_STATIC, &f->flags)) fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); } /* Flush all entries referring to a specific port. * if do_all is set also flush static entries * if vid is set delete all entries that match the vlan_id */ void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all) { struct net_bridge_fdb_entry *f; struct hlist_node *tmp; spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { if (f->dst != p) continue; if (!do_all) if (test_bit(BR_FDB_STATIC, &f->flags) || (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags)) || (vid && f->key.vlan_id != vid)) continue; if (test_bit(BR_FDB_LOCAL, &f->flags)) fdb_delete_local(br, p, f); else fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); } #if IS_ENABLED(CONFIG_ATM_LANE) /* Interface used by ATM LANE hook to test * if an addr is on some other bridge port */ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) { struct net_bridge_fdb_entry *fdb; struct net_bridge_port *port; int ret; rcu_read_lock(); port = br_port_get_rcu(dev); if (!port) ret = 0; else { const struct net_bridge_port *dst = NULL; fdb = br_fdb_find_rcu(port->br, addr, 0); if (fdb) dst = READ_ONCE(fdb->dst); ret = dst && dst->dev != dev && dst->state == BR_STATE_FORWARDING; } rcu_read_unlock(); return ret; } #endif /* CONFIG_ATM_LANE */ /* * Fill buffer with forwarding table records in * the API format. */ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long maxnum, unsigned long skip) { struct net_bridge_fdb_entry *f; struct __fdb_entry *fe = buf; int num = 0; memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { if (num >= maxnum) break; if (has_expired(br, f)) continue; /* ignore pseudo entry for local MAC address */ if (!f->dst) continue; if (skip) { --skip; continue; } /* convert from internal format to API */ memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN); /* due to ABI compat need to split into hi/lo */ fe->port_no = f->dst->port_no; fe->port_hi = f->dst->port_no >> 8; fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags); if (!test_bit(BR_FDB_STATIC, &f->flags)) fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); ++fe; ++num; } rcu_read_unlock(); return num; } static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, __u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); if (fdb) { memcpy(fdb->key.addr.addr, addr, ETH_ALEN); WRITE_ONCE(fdb->dst, source); fdb->key.vlan_id = vid; fdb->flags = flags; fdb->updated = fdb->used = jiffies; if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode, br_fdb_rht_params)) { kmem_cache_free(br_fdb_cache, fdb); fdb = NULL; } else { hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list); } } return fdb; } static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid) { struct net_bridge_fdb_entry *fdb; if (!is_valid_ether_addr(addr)) return -EINVAL; fdb = br_fdb_find(br, addr, vid); if (fdb) { /* it is okay to have multiple ports with same * address, just use the first one. */ if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return 0; br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", source ? source->dev->name : br->dev->name, addr, vid); fdb_delete(br, fdb, true); } fdb = fdb_create(br, source, addr, vid, BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC)); if (!fdb) return -ENOMEM; fdb_add_hw_addr(br, addr); fdb_notify(br, fdb, RTM_NEWNEIGH, true); return 0; } /* Add entry for local address of interface */ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid) { int ret; spin_lock_bh(&br->hash_lock); ret = fdb_insert(br, source, addr, vid); spin_unlock_bh(&br->hash_lock); return ret; } /* returns true if the fdb was modified */ static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb) { return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) && test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)); } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; /* some users want to always flood. */ if (hold_time(br) == 0) return; fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) { if (net_ratelimit()) br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n", source->dev->name, addr, vid); } else { unsigned long now = jiffies; bool fdb_modified = false; if (now != fdb->updated) { fdb->updated = now; fdb_modified = __fdb_mark_active(fdb); } /* fastpath: update of existing entry */ if (unlikely(source != READ_ONCE(fdb->dst) && !test_bit(BR_FDB_STICKY, &fdb->flags))) { br_switchdev_fdb_notify(br, fdb, RTM_DELNEIGH); WRITE_ONCE(fdb->dst, source); fdb_modified = true; /* Take over HW learned entry */ if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))) clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); } if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } } } else { spin_lock(&br->hash_lock); fdb = fdb_create(br, source, addr, vid, flags); if (fdb) { trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } /* else we lose race and someone else inserts * it first, don't bother updating */ spin_unlock(&br->hash_lock); } } static int fdb_to_nud(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return NUD_PERMANENT; else if (test_bit(BR_FDB_STATIC, &fdb->flags)) return NUD_NOARP; else if (has_expired(br, fdb)) return NUD_STALE; else return NUD_REACHABLE; } static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, u32 portid, u32 seq, int type, unsigned int flags) { const struct net_bridge_port *dst = READ_ONCE(fdb->dst); unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = 0; ndm->ndm_type = 0; ndm->ndm_ifindex = dst ? dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) ndm->ndm_flags |= NTF_OFFLOADED; if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) ndm->ndm_flags |= NTF_EXT_LEARNED; if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->key.vlan_id)) goto nla_put_failure; if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); u8 notify_bits = FDB_NOTIFY_BIT; if (!nest) goto nla_put_failure; if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) notify_bits |= FDB_NOTIFY_INACTIVE_BIT; if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { nla_nest_cancel(skb, nest); goto nla_put_failure; } nla_nest_end(skb, nest); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t fdb_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } static int br_fdb_replay_one(struct net_bridge *br, struct notifier_block *nb, const struct net_bridge_fdb_entry *fdb, unsigned long action, const void *ctx) { const struct net_bridge_port *p = READ_ONCE(fdb->dst); struct switchdev_notifier_fdb_info item; int err; item.addr = fdb->key.addr.addr; item.vid = fdb->key.vlan_id; item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); item.is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); item.info.dev = (!p || item.is_local) ? br->dev : p->dev; item.info.ctx = ctx; err = nb->notifier_call(nb, action, &item); return notifier_to_errno(err); } int br_fdb_replay(const struct net_device *br_dev, const void *ctx, bool adding, struct notifier_block *nb) { struct net_bridge_fdb_entry *fdb; struct net_bridge *br; unsigned long action; int err = 0; if (!nb) return 0; if (!netif_is_bridge_master(br_dev)) return -EINVAL; br = netdev_priv(br_dev); if (adding) action = SWITCHDEV_FDB_ADD_TO_DEVICE; else action = SWITCHDEV_FDB_DEL_TO_DEVICE; rcu_read_lock(); hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { err = br_fdb_replay_one(br, nb, fdb, action, ctx); if (err) break; } rcu_read_unlock(); return err; } static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type, bool swdev_notify) { struct net *net = dev_net(br->dev); struct sk_buff *skb; int err = -ENOBUFS; if (swdev_notify) br_switchdev_fdb_notify(br, fdb, type); skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0); if (err < 0) { /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } /* Dump information about entries, in response to GETNEIGH */ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *f; int err = 0; if (!(dev->priv_flags & IFF_EBRIDGE)) return err; if (!filter_dev) { err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); if (err < 0) return err; } rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { if (*idx < cb->args[2]) goto skip; if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { if (filter_dev != dev) goto skip; /* !f->dst is a special case for bridge * It means the MAC belongs to the bridge * Therefore need a little more filtering * we only want to dump the !f->dst case */ if (f->dst) goto skip; } if (!filter_dev && f->dst) goto skip; err = fdb_fill_info(skb, br, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI); if (err < 0) break; skip: *idx += 1; } rcu_read_unlock(); return err; } int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *f; int err = 0; rcu_read_lock(); f = br_fdb_find_rcu(br, addr, vid); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; goto errout; } err = fdb_fill_info(skb, br, f, portid, seq, RTM_NEWNEIGH, 0); errout: rcu_read_unlock(); return err; } /* returns true if the fdb is modified */ static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify) { bool modified = false; /* allow to mark an entry as inactive, usually done on creation */ if ((notify & FDB_NOTIFY_INACTIVE_BIT) && !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) modified = true; if ((notify & FDB_NOTIFY_BIT) && !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) { /* enabled activity tracking */ modified = true; } else if (!(notify & FDB_NOTIFY_BIT) && test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) { /* disabled activity tracking, clear notify state */ clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags); modified = true; } return modified; } /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid, struct nlattr *nfea_tb[]) { bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); bool refresh = !nfea_tb[NFEA_DONT_REFRESH]; struct net_bridge_fdb_entry *fdb; u16 state = ndm->ndm_state; bool modified = false; u8 notify = 0; /* If the port cannot learn allow only local and static entries */ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && !(source->state == BR_STATE_LEARNING || source->state == BR_STATE_FORWARDING)) return -EPERM; if (!source && !(state & NUD_PERMANENT)) { pr_info("bridge: RTM_NEWNEIGH %s without NUD_PERMANENT\n", br->dev->name); return -EINVAL; } if (is_sticky && (state & NUD_PERMANENT)) return -EINVAL; if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) { notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]); if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) || (notify & BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT) return -EINVAL; } fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) return -ENOENT; fdb = fdb_create(br, source, addr, vid, 0); if (!fdb) return -ENOMEM; modified = true; } else { if (flags & NLM_F_EXCL) return -EEXIST; if (READ_ONCE(fdb->dst) != source) { WRITE_ONCE(fdb->dst, source); modified = true; } } if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { set_bit(BR_FDB_LOCAL, &fdb->flags); if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); } else if (state & NUD_NOARP) { clear_bit(BR_FDB_LOCAL, &fdb->flags); if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); } else { clear_bit(BR_FDB_LOCAL, &fdb->flags); if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags)) fdb_del_hw_addr(br, addr); } modified = true; } if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) { change_bit(BR_FDB_STICKY, &fdb->flags); modified = true; } if (fdb_handle_notify(fdb, notify)) modified = true; set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; if (modified) { if (refresh) fdb->updated = jiffies; fdb_notify(br, fdb, RTM_NEWNEIGH, true); } return 0; } static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[], struct netlink_ext_ack *extack) { int err = 0; if (ndm->ndm_flags & NTF_USE) { if (!p) { pr_info("bridge: RTM_NEWNEIGH %s with NTF_USE is not supported\n", br->dev->name); return -EINVAL; } if (!nbp_state_should_learn(p)) return 0; local_bh_disable(); rcu_read_lock(); br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER)); rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { if (!p && !(ndm->ndm_state & NUD_PERMANENT)) { NL_SET_ERR_MSG_MOD(extack, "FDB entry towards bridge must be permanent"); return -EINVAL; } err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); spin_unlock_bh(&br->hash_lock); } return err; } static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, [NFEA_DONT_REFRESH] = { .type = NLA_FLAG }, }; /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, struct netlink_ext_ack *extack) { struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; struct net_bridge *br = NULL; int err = 0; trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags); if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) { pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); return -EINVAL; } if (is_zero_ether_addr(addr)) { pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n"); return -EINVAL; } if (dev->priv_flags & IFF_EBRIDGE) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { p = br_port_get_rtnl(dev); if (!p) { pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", dev->name); return -EINVAL; } br = p->br; vg = nbp_vlan_group(p); } if (tb[NDA_FDB_EXT_ATTRS]) { attr = tb[NDA_FDB_EXT_ATTRS]; err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, br_nda_fdb_pol, extack); if (err) return err; } else { memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1)); } if (vid) { v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) { pr_info("bridge: RTM_NEWNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } /* VID was specified, so use it. */ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb, extack); } else { err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb, extack); if (err || !vg || !vg->num_vlans) goto out; /* We have vlans configured on this port and user didn't * specify a VLAN. To be nice, add/update entry for every * vlan on this port. */ list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, nfea_tb, extack); if (err) goto out; } } out: return err; } static int fdb_delete_by_addr_and_port(struct net_bridge *br, const struct net_bridge_port *p, const u8 *addr, u16 vlan) { struct net_bridge_fdb_entry *fdb; fdb = br_fdb_find(br, addr, vlan); if (!fdb || READ_ONCE(fdb->dst) != p) return -ENOENT; fdb_delete(br, fdb, true); return 0; } static int __br_fdb_delete(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid) { int err; spin_lock_bh(&br->hash_lock); err = fdb_delete_by_addr_and_port(br, p, addr, vid); spin_unlock_bh(&br->hash_lock); return err; } /* Remove neighbor entry with RTM_DELNEIGH */ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; struct net_bridge *br; int err; if (dev->priv_flags & IFF_EBRIDGE) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { p = br_port_get_rtnl(dev); if (!p) { pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", dev->name); return -EINVAL; } vg = nbp_vlan_group(p); br = p->br; } if (vid) { v = br_vlan_find(vg, vid); if (!v) { pr_info("bridge: RTM_DELNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } err = __br_fdb_delete(br, p, addr, vid); } else { err = -ENOENT; err &= __br_fdb_delete(br, p, addr, 0); if (!vg || !vg->num_vlans) return err; list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; err &= __br_fdb_delete(br, p, addr, v->vid); } } return err; } int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) { struct net_bridge_fdb_entry *f, *tmp; int err = 0; ASSERT_RTNL(); /* the key here is that static entries change only under rtnl */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; err = dev_uc_add(p->dev, f->key.addr.addr); if (err) goto rollback; } done: rcu_read_unlock(); return err; rollback: hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) { /* We only care for static entries */ if (!test_bit(BR_FDB_STATIC, &tmp->flags)) continue; if (tmp == f) break; dev_uc_del(p->dev, tmp->key.addr.addr); } goto done; } void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) { struct net_bridge_fdb_entry *f; ASSERT_RTNL(); rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; dev_uc_del(p->dev, f->key.addr.addr); } rcu_read_unlock(); } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; bool modified = false; int err = 0; trace_br_fdb_external_learn_add(br, p, addr, vid); spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); if (!fdb) { unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN); if (swdev_notify) flags |= BIT(BR_FDB_ADDED_BY_USER); if (!p) flags |= BIT(BR_FDB_LOCAL); fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; goto err_unlock; } fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { fdb->updated = jiffies; if (READ_ONCE(fdb->dst) != p) { WRITE_ONCE(fdb->dst, p); modified = true; } if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ fdb->used = jiffies; } else if (!test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags)) { /* Take over SW learned entry */ set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); modified = true; } if (swdev_notify) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (!p) set_bit(BR_FDB_LOCAL, &fdb->flags); if (modified) fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } err_unlock: spin_unlock_bh(&br->hash_lock); return err; } int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; int err = 0; spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) fdb_delete(br, fdb, swdev_notify); else err = -ENOENT; spin_unlock_bh(&br->hash_lock); return err; } void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool offloaded) { struct net_bridge_fdb_entry *fdb; spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags)) change_bit(BR_FDB_OFFLOADED, &fdb->flags); spin_unlock_bh(&br->hash_lock); } void br_fdb_clear_offload(const struct net_device *dev, u16 vid) { struct net_bridge_fdb_entry *f; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_rtnl(dev); if (!p) return; spin_lock_bh(&p->br->hash_lock); hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { if (f->dst == p && f->key.vlan_id == vid) clear_bit(BR_FDB_OFFLOADED, &f->flags); } spin_unlock_bh(&p->br->hash_lock); } EXPORT_SYMBOL_GPL(br_fdb_clear_offload);
2 2 2 2 2 2 2 2 2 5 5 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/uaccess.h> #include <linux/debugfs.h> #include <linux/caif/caif_socket.h> #include <linux/pkt_sched.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/caif/caif_layer.h> #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_CAIF); /* * CAIF state is re-using the TCP socket states. * caif_states stored in sk_state reflect the state as reported by * the CAIF stack, while sk_socket->state is the state of the socket. */ enum caif_states { CAIF_CONNECTED = TCP_ESTABLISHED, CAIF_CONNECTING = TCP_SYN_SENT, CAIF_DISCONNECTED = TCP_CLOSE }; #define TX_FLOW_ON_BIT 1 #define RX_FLOW_ON_BIT 2 struct caifsock { struct sock sk; /* must be first member */ struct cflayer layer; u32 flow_state; struct caif_connect_request conn_req; struct mutex readlock; struct dentry *debugfs_socket_dir; int headroom, tailroom, maxframe; }; static int rx_flow_is_on(struct caifsock *cf_sk) { return test_bit(RX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static int tx_flow_is_on(struct caifsock *cf_sk) { return test_bit(TX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void set_rx_flow_off(struct caifsock *cf_sk) { clear_bit(RX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void set_rx_flow_on(struct caifsock *cf_sk) { set_bit(RX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void set_tx_flow_off(struct caifsock *cf_sk) { clear_bit(TX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void set_tx_flow_on(struct caifsock *cf_sk) { set_bit(TX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void caif_read_lock(struct sock *sk) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); mutex_lock(&cf_sk->readlock); } static void caif_read_unlock(struct sock *sk) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); mutex_unlock(&cf_sk->readlock); } static int sk_rcvbuf_lowwater(struct caifsock *cf_sk) { /* A quarter of full buffer is used a low water mark */ return cf_sk->sk.sk_rcvbuf / 4; } static void caif_flow_ctrl(struct sock *sk, int mode) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); if (cf_sk->layer.dn && cf_sk->layer.dn->modemcmd) cf_sk->layer.dn->modemcmd(cf_sk->layer.dn, mode); } /* * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are * not dropped, but CAIF is sending flow off instead. */ static void caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); bool queued = false; if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { net_dbg_ratelimited("sending flow OFF (queue len = %d %d)\n", atomic_read(&cf_sk->sk.sk_rmem_alloc), sk_rcvbuf_lowwater(cf_sk)); set_rx_flow_off(cf_sk); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); } err = sk_filter(sk, skb); if (err) goto out; if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); } skb->dev = NULL; skb_set_owner_r(skb, sk); spin_lock_irqsave(&list->lock, flags); queued = !sock_flag(sk, SOCK_DEAD); if (queued) __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); out: if (queued) sk->sk_data_ready(sk); else kfree_skb(skb); } /* Packet Receive Callback function called from CAIF Stack */ static int caif_sktrecv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct caifsock *cf_sk; struct sk_buff *skb; cf_sk = container_of(layr, struct caifsock, layer); skb = cfpkt_tonative(pkt); if (unlikely(cf_sk->sk.sk_state != CAIF_CONNECTED)) { kfree_skb(skb); return 0; } caif_queue_rcv_skb(&cf_sk->sk, skb); return 0; } static void cfsk_hold(struct cflayer *layr) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); sock_hold(&cf_sk->sk); } static void cfsk_put(struct cflayer *layr) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); sock_put(&cf_sk->sk); } /* Packet Control Callback function called from CAIF */ static void caif_ctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, int phyid) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); switch (flow) { case CAIF_CTRLCMD_FLOW_ON_IND: /* OK from modem to start sending again */ set_tx_flow_on(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_FLOW_OFF_IND: /* Modem asks us to shut up */ set_tx_flow_off(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_INIT_RSP: /* We're now connected */ caif_client_register_refcnt(&cf_sk->layer, cfsk_hold, cfsk_put); cf_sk->sk.sk_state = CAIF_CONNECTED; set_tx_flow_on(cf_sk); cf_sk->sk.sk_shutdown = 0; cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_DEINIT_RSP: /* We're now disconnected */ cf_sk->sk.sk_state = CAIF_DISCONNECTED; cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_INIT_FAIL_RSP: /* Connect request failed */ cf_sk->sk.sk_err = ECONNREFUSED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; /* * Socket "standards" seems to require POLLOUT to * be set at connect failure. */ set_tx_flow_on(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: /* Modem has closed this connection, or device is down. */ cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; cf_sk->sk.sk_err = ECONNRESET; set_rx_flow_on(cf_sk); sk_error_report(&cf_sk->sk); break; default: pr_debug("Unexpected flow command %d\n", flow); } } static void caif_check_flow_release(struct sock *sk) { struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); if (rx_flow_is_on(cf_sk)) return; if (atomic_read(&sk->sk_rmem_alloc) <= sk_rcvbuf_lowwater(cf_sk)) { set_rx_flow_on(cf_sk); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ); } } /* * Copied from unix_dgram_recvmsg, but removed credit checks, * changed locking, address handling and added MSG_TRUNC. */ static int caif_seqpkt_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int ret; int copylen; ret = -EOPNOTSUPP; if (flags & MSG_OOB) goto read_error; skb = skb_recv_datagram(sk, flags, 0 , &ret); if (!skb) goto read_error; copylen = skb->len; if (len < copylen) { m->msg_flags |= MSG_TRUNC; copylen = len; } ret = skb_copy_datagram_msg(skb, 0, m, copylen); if (ret) goto out_free; ret = (flags & MSG_TRUNC) ? skb->len : copylen; out_free: skb_free_datagram(sk, skb); caif_check_flow_release(sk); return ret; read_error: return ret; } /* Copied from unix_stream_wait_data, identical except for lock call. */ static long caif_stream_data_wait(struct sock *sk, long timeo) { DEFINE_WAIT(wait); lock_sock(sk); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!skb_queue_empty(&sk->sk_receive_queue) || sk->sk_err || sk->sk_state != CAIF_CONNECTED || sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) break; sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); release_sock(sk); return timeo; } /* * Copied from unix_stream_recvmsg, but removed credit checks, * changed locking calls, changed address handling. */ static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int copied = 0; int target; int err = 0; long timeo; err = -EOPNOTSUPP; if (flags&MSG_OOB) goto out; /* * Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ err = -EAGAIN; if (sk->sk_state == CAIF_CONNECTING) goto out; caif_read_lock(sk); target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); do { int chunk; struct sk_buff *skb; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; goto unlock; } skb = skb_dequeue(&sk->sk_receive_queue); caif_check_flow_release(sk); if (skb == NULL) { if (copied >= target) goto unlock; /* * POSIX 1003.1g mandates this order. */ err = sock_error(sk); if (err) goto unlock; err = -ECONNRESET; if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; err = -EPIPE; if (sk->sk_state != CAIF_CONNECTED) goto unlock; if (sock_flag(sk, SOCK_DEAD)) goto unlock; release_sock(sk); err = -EAGAIN; if (!timeo) break; caif_read_unlock(sk); timeo = caif_stream_data_wait(sk, timeo); if (signal_pending(current)) { err = sock_intr_errno(timeo); goto out; } caif_read_lock(sk); continue; unlock: release_sock(sk); break; } release_sock(sk); chunk = min_t(unsigned int, skb->len, size); if (memcpy_to_msg(msg, skb->data, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (copied == 0) copied = -EFAULT; break; } copied += chunk; size -= chunk; /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { skb_pull(skb, chunk); /* put the skb back if we didn't use it up. */ if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); break; } kfree_skb(skb); } else { /* * It is questionable, see note in unix_dgram_recvmsg. */ /* put message back and return */ skb_queue_head(&sk->sk_receive_queue, skb); break; } } while (size); caif_read_unlock(sk); out: return copied ? : err; } /* * Copied from sock.c:sock_wait_for_wmem, but change to wait for * CAIF flow-on and sock_writable. */ static long caif_wait_for_flow_on(struct caifsock *cf_sk, int wait_writeable, long timeo, int *err) { struct sock *sk = &cf_sk->sk; DEFINE_WAIT(wait); for (;;) { *err = 0; if (tx_flow_is_on(cf_sk) && (!wait_writeable || sock_writeable(&cf_sk->sk))) break; *err = -ETIMEDOUT; if (!timeo) break; *err = -ERESTARTSYS; if (signal_pending(current)) break; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); *err = -ECONNRESET; if (sk->sk_shutdown & SHUTDOWN_MASK) break; *err = -sk->sk_err; if (sk->sk_err) break; *err = -EPIPE; if (cf_sk->sk.sk_state != CAIF_CONNECTED) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Transmit a SKB. The device may temporarily request re-transmission * by returning EAGAIN. */ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, int noblock, long timeo) { struct cfpkt *pkt; pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb); memset(skb->cb, 0, sizeof(struct caif_payload_info)); cfpkt_set_prio(pkt, cf_sk->sk.sk_priority); if (cf_sk->layer.dn == NULL) { kfree_skb(skb); return -EINVAL; } return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt); } /* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int buffer_size; int ret = 0; struct sk_buff *skb = NULL; int noblock; long timeo; caif_assert(cf_sk); ret = sock_error(sk); if (ret) goto err; ret = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto err; ret = -EOPNOTSUPP; if (msg->msg_namelen) goto err; ret = -EINVAL; if (unlikely(msg->msg_iter.nr_segs == 0) || unlikely(msg->msg_iter.iov->iov_base == NULL)) goto err; noblock = msg->msg_flags & MSG_DONTWAIT; timeo = sock_sndtimeo(sk, noblock); timeo = caif_wait_for_flow_on(container_of(sk, struct caifsock, sk), 1, timeo, &ret); if (ret) goto err; ret = -EPIPE; if (cf_sk->sk.sk_state != CAIF_CONNECTED || sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) goto err; /* Error if trying to write more than maximum frame size. */ ret = -EMSGSIZE; if (len > cf_sk->maxframe && cf_sk->sk.sk_protocol != CAIFPROTO_RFM) goto err; buffer_size = len + cf_sk->headroom + cf_sk->tailroom; ret = -ENOMEM; skb = sock_alloc_send_skb(sk, buffer_size, noblock, &ret); if (!skb || skb_tailroom(skb) < buffer_size) goto err; skb_reserve(skb, cf_sk->headroom); ret = memcpy_from_msg(skb_put(skb, len), msg, len); if (ret) goto err; ret = transmit_skb(skb, cf_sk, noblock, timeo); if (ret < 0) /* skb is already freed */ return ret; return len; err: kfree_skb(skb); return ret; } /* * Copied from unix_stream_sendmsg and adapted to CAIF: * Changed removed permission handling and added waiting for flow on * and other minor adaptations. */ static int caif_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int err, size; struct sk_buff *skb; int sent = 0; long timeo; err = -EOPNOTSUPP; if (unlikely(msg->msg_flags&MSG_OOB)) goto out_err; if (unlikely(msg->msg_namelen)) goto out_err; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); timeo = caif_wait_for_flow_on(cf_sk, 1, timeo, &err); if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) goto pipe_err; while (sent < len) { size = len-sent; if (size > cf_sk->maxframe) size = cf_sk->maxframe; /* If size is more than half of sndbuf, chop up message */ if (size > ((sk->sk_sndbuf >> 1) - 64)) size = (sk->sk_sndbuf >> 1) - 64; if (size > SKB_MAX_ALLOC) size = SKB_MAX_ALLOC; skb = sock_alloc_send_skb(sk, size + cf_sk->headroom + cf_sk->tailroom, msg->msg_flags&MSG_DONTWAIT, &err); if (skb == NULL) goto out_err; skb_reserve(skb, cf_sk->headroom); /* * If you pass two values to the sock_alloc_send_skb * it tries to grab the large buffer with GFP_NOFS * (which can fail easily), and if it fails grab the * fallback size buffer which is under a page and will * succeed. [Alan] */ size = min_t(int, size, skb_tailroom(skb)); err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err) { kfree_skb(skb); goto out_err; } err = transmit_skb(skb, cf_sk, msg->msg_flags&MSG_DONTWAIT, timeo); if (err < 0) /* skb is already freed */ goto pipe_err; sent += size; } return sent; pipe_err: if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: return sent ? : err; } static int setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov, unsigned int ol) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int linksel; if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED) return -ENOPROTOOPT; switch (opt) { case CAIFSO_LINK_SELECT: if (ol < sizeof(int)) return -EINVAL; if (lvl != SOL_CAIF) goto bad_sol; if (copy_from_sockptr(&linksel, ov, sizeof(int))) return -EINVAL; lock_sock(&(cf_sk->sk)); cf_sk->conn_req.link_selector = linksel; release_sock(&cf_sk->sk); return 0; case CAIFSO_REQ_PARAM: if (lvl != SOL_CAIF) goto bad_sol; if (cf_sk->sk.sk_protocol != CAIFPROTO_UTIL) return -ENOPROTOOPT; lock_sock(&(cf_sk->sk)); if (ol > sizeof(cf_sk->conn_req.param.data) || copy_from_sockptr(&cf_sk->conn_req.param.data, ov, ol)) { release_sock(&cf_sk->sk); return -EINVAL; } cf_sk->conn_req.param.size = ol; release_sock(&cf_sk->sk); return 0; default: return -ENOPROTOOPT; } return 0; bad_sol: return -ENOPROTOOPT; } /* * caif_connect() - Connect a CAIF Socket * Copied and modified af_irda.c:irda_connect(). * * Note : by consulting "errno", the user space caller may learn the cause * of the failure. Most of them are visible in the function, others may come * from subroutines called and are listed here : * o -EAFNOSUPPORT: bad socket family or type. * o -ESOCKTNOSUPPORT: bad socket type or protocol * o -EINVAL: bad socket address, or CAIF link type * o -ECONNREFUSED: remote end refused the connection. * o -EINPROGRESS: connect request sent but timed out (or non-blocking) * o -EISCONN: already connected. * o -ETIMEDOUT: Connection timed out (send timeout) * o -ENODEV: No link layer to send request * o -ECONNRESET: Received Shutdown indication or lost link layer * o -ENOMEM: Out of memory * * State Strategy: * o sk_state: holds the CAIF_* protocol state, it's updated by * caif_ctrl_cb. * o sock->state: holds the SS_* socket state and is updated by connect and * disconnect. */ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); long timeo; int err; int ifindex, headroom, tailroom; unsigned int mtu; struct net_device *dev; lock_sock(sk); err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) goto out; err = -EAFNOSUPPORT; if (uaddr->sa_family != AF_CAIF) goto out; switch (sock->state) { case SS_UNCONNECTED: /* Normal case, a fresh connect */ caif_assert(sk->sk_state == CAIF_DISCONNECTED); break; case SS_CONNECTING: switch (sk->sk_state) { case CAIF_CONNECTED: sock->state = SS_CONNECTED; err = -EISCONN; goto out; case CAIF_DISCONNECTED: /* Reconnect allowed */ break; case CAIF_CONNECTING: err = -EALREADY; if (flags & O_NONBLOCK) goto out; goto wait_connect; } break; case SS_CONNECTED: caif_assert(sk->sk_state == CAIF_CONNECTED || sk->sk_state == CAIF_DISCONNECTED); if (sk->sk_shutdown & SHUTDOWN_MASK) { /* Allow re-connect after SHUTDOWN_IND */ caif_disconnect_client(sock_net(sk), &cf_sk->layer); caif_free_client(&cf_sk->layer); break; } /* No reconnect on a seqpacket socket */ err = -EISCONN; goto out; case SS_DISCONNECTING: case SS_FREE: caif_assert(1); /*Should never happen */ break; } sk->sk_state = CAIF_DISCONNECTED; sock->state = SS_UNCONNECTED; sk_stream_kill_queues(&cf_sk->sk); err = -EINVAL; if (addr_len != sizeof(struct sockaddr_caif)) goto out; memcpy(&cf_sk->conn_req.sockaddr, uaddr, sizeof(struct sockaddr_caif)); /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = CAIF_CONNECTING; /* Check priority value comming from socket */ /* if priority value is out of range it will be ajusted */ if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX) cf_sk->conn_req.priority = CAIF_PRIO_MAX; else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN) cf_sk->conn_req.priority = CAIF_PRIO_MIN; else cf_sk->conn_req.priority = cf_sk->sk.sk_priority; /*ifindex = id of the interface.*/ cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if; cf_sk->layer.receive = caif_sktrecv_cb; err = caif_connect_client(sock_net(sk), &cf_sk->conn_req, &cf_sk->layer, &ifindex, &headroom, &tailroom); if (err < 0) { cf_sk->sk.sk_socket->state = SS_UNCONNECTED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; goto out; } err = -ENODEV; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { rcu_read_unlock(); goto out; } cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom); mtu = dev->mtu; rcu_read_unlock(); cf_sk->tailroom = tailroom; cf_sk->maxframe = mtu - (headroom + tailroom); if (cf_sk->maxframe < 1) { pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu); err = -ENODEV; goto out; } err = -EINPROGRESS; wait_connect: if (sk->sk_state != CAIF_CONNECTED && (flags & O_NONBLOCK)) goto out; timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); release_sock(sk); err = -ERESTARTSYS; timeo = wait_event_interruptible_timeout(*sk_sleep(sk), sk->sk_state != CAIF_CONNECTING, timeo); lock_sock(sk); if (timeo < 0) goto out; /* -ERESTARTSYS */ err = -ETIMEDOUT; if (timeo == 0 && sk->sk_state != CAIF_CONNECTED) goto out; if (sk->sk_state != CAIF_CONNECTED) { sock->state = SS_UNCONNECTED; err = sock_error(sk); if (!err) err = -ECONNREFUSED; goto out; } sock->state = SS_CONNECTED; err = 0; out: release_sock(sk); return err; } /* * caif_release() - Disconnect a CAIF Socket * Copied and modified af_irda.c:irda_release(). */ static int caif_release(struct socket *sock) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); if (!sk) return 0; set_tx_flow_off(cf_sk); /* * Ensure that packets are not queued after this point in time. * caif_queue_rcv_skb checks SOCK_DEAD holding the queue lock, * this ensures no packets when sock is dead. */ spin_lock_bh(&sk->sk_receive_queue.lock); sock_set_flag(sk, SOCK_DEAD); spin_unlock_bh(&sk->sk_receive_queue.lock); sock->sk = NULL; WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir)); debugfs_remove_recursive(cf_sk->debugfs_socket_dir); lock_sock(&(cf_sk->sk)); sk->sk_state = CAIF_DISCONNECTED; sk->sk_shutdown = SHUTDOWN_MASK; caif_disconnect_client(sock_net(sk), &cf_sk->layer); cf_sk->sk.sk_socket->state = SS_DISCONNECTING; wake_up_interruptible_poll(sk_sleep(sk), EPOLLERR|EPOLLHUP); sock_orphan(sk); sk_stream_kill_queues(&cf_sk->sk); release_sock(sk); sock_put(sk); return 0; } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ static __poll_t caif_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ if (sk->sk_err) mask |= EPOLLERR; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) mask |= EPOLLIN | EPOLLRDNORM; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ if (sock_writeable(sk) && tx_flow_is_on(cf_sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static const struct proto_ops caif_seqpacket_ops = { .family = PF_CAIF, .owner = THIS_MODULE, .release = caif_release, .bind = sock_no_bind, .connect = caif_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, .sendmsg = caif_seqpkt_sendmsg, .recvmsg = caif_seqpkt_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; static const struct proto_ops caif_stream_ops = { .family = PF_CAIF, .owner = THIS_MODULE, .release = caif_release, .bind = sock_no_bind, .connect = caif_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, .sendmsg = caif_stream_sendmsg, .recvmsg = caif_stream_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; /* This function is called when a socket is finally destroyed. */ static void caif_sock_destructor(struct sock *sk) { struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); caif_assert(!refcount_read(&sk->sk_wmem_alloc)); caif_assert(sk_unhashed(sk)); caif_assert(!sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_debug("Attempt to release alive CAIF socket: %p\n", sk); return; } sk_stream_kill_queues(&cf_sk->sk); WARN_ON(sk->sk_forward_alloc); caif_free_client(&cf_sk->layer); } static int caif_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk = NULL; struct caifsock *cf_sk = NULL; static struct proto prot = {.name = "PF_CAIF", .owner = THIS_MODULE, .obj_size = sizeof(struct caifsock), .useroffset = offsetof(struct caifsock, conn_req.param), .usersize = sizeof_field(struct caifsock, conn_req.param) }; if (!capable(CAP_SYS_ADMIN) && !capable(CAP_NET_ADMIN)) return -EPERM; /* * The sock->type specifies the socket type to use. * The CAIF socket is a packet stream in the sense * that it is packet based. CAIF trusts the reliability * of the link, no resending is implemented. */ if (sock->type == SOCK_SEQPACKET) sock->ops = &caif_seqpacket_ops; else if (sock->type == SOCK_STREAM) sock->ops = &caif_stream_ops; else return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= CAIFPROTO_MAX) return -EPROTONOSUPPORT; /* * Set the socket state to unconnected. The socket state * is really not used at all in the net/core or socket.c but the * initialization makes sure that sock->state is not uninitialized. */ sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern); if (!sk) return -ENOMEM; cf_sk = container_of(sk, struct caifsock, sk); /* Store the protocol */ sk->sk_protocol = (unsigned char) protocol; /* Initialize default priority for well-known cases */ switch (protocol) { case CAIFPROTO_AT: sk->sk_priority = TC_PRIO_CONTROL; break; case CAIFPROTO_RFM: sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; break; default: sk->sk_priority = TC_PRIO_BESTEFFORT; } /* * Lock in order to try to stop someone from opening the socket * too early. */ lock_sock(&(cf_sk->sk)); /* Initialize the nozero default sock structure data. */ sock_init_data(sock, sk); sk->sk_destruct = caif_sock_destructor; mutex_init(&cf_sk->readlock); /* single task reading lock */ cf_sk->layer.ctrlcmd = caif_ctrl_cb; cf_sk->sk.sk_socket->state = SS_UNCONNECTED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; set_tx_flow_off(cf_sk); set_rx_flow_on(cf_sk); /* Set default options on configuration */ cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY; cf_sk->conn_req.protocol = protocol; release_sock(&cf_sk->sk); return 0; } static const struct net_proto_family caif_family_ops = { .family = PF_CAIF, .create = caif_create, .owner = THIS_MODULE, }; static int __init caif_sktinit_module(void) { return sock_register(&caif_family_ops); } static void __exit caif_sktexit_module(void) { sock_unregister(PF_CAIF); } module_init(caif_sktinit_module); module_exit(caif_sktexit_module);
5493 5567 486 5618 16 1124 2383 14 4 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SEQLOCK_H #define __LINUX_SEQLOCK_H /* * seqcount_t / seqlock_t - a reader-writer consistency mechanism with * lockless readers (read-only retry loops), and no writer starvation. * * See Documentation/locking/seqlock.rst * * Copyrights: * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH */ #include <linux/compiler.h> #include <linux/kcsan-checks.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/ww_mutex.h> #include <linux/preempt.h> #include <linux/spinlock.h> #include <asm/processor.h> /* * The seqlock seqcount_t interface does not prescribe a precise sequence of * read begin/retry/end. For readers, typically there is a call to * read_seqcount_begin() and read_seqcount_retry(), however, there are more * esoteric cases which do not follow this pattern. * * As a consequence, we take the following best-effort approach for raw usage * via seqcount_t under KCSAN: upon beginning a seq-reader critical section, * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as * atomics; if there is a matching read_seqcount_retry() call, no following * memory operations are considered atomic. Usage of the seqlock_t interface * is not affected. */ #define KCSAN_SEQLOCK_REGION_MAX 1000 /* * Sequence counters (seqcount_t) * * This is the raw counting mechanism, without any writer protection. * * Write side critical sections must be serialized and non-preemptible. * * If readers can be invoked from hardirq or softirq contexts, * interrupts or bottom halves must also be respectively disabled before * entering the write section. * * This mechanism can't be used if the protected data contains pointers, * as the writer can invalidate a pointer that a reader is following. * * If the write serialization mechanism is one of the common kernel * locking primitives, use a sequence counter with associated lock * (seqcount_LOCKNAME_t) instead. * * If it's desired to automatically handle the sequence counter writer * serialization and non-preemptibility requirements, use a sequential * lock (seqlock_t) instead. * * See Documentation/locking/seqlock.rst */ typedef struct seqcount { unsigned sequence; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } seqcount_t; static inline void __seqcount_init(seqcount_t *s, const char *name, struct lock_class_key *key) { /* * Make sure we are not reinitializing a held lock: */ lockdep_init_map(&s->dep_map, name, key, 0); s->sequence = 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC # define SEQCOUNT_DEP_MAP_INIT(lockname) \ .dep_map = { .name = #lockname } /** * seqcount_init() - runtime initializer for seqcount_t * @s: Pointer to the seqcount_t instance */ # define seqcount_init(s) \ do { \ static struct lock_class_key __key; \ __seqcount_init((s), #s, &__key); \ } while (0) static inline void seqcount_lockdep_reader_access(const seqcount_t *s) { seqcount_t *l = (seqcount_t *)s; unsigned long flags; local_irq_save(flags); seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_); seqcount_release(&l->dep_map, _RET_IP_); local_irq_restore(flags); } #else # define SEQCOUNT_DEP_MAP_INIT(lockname) # define seqcount_init(s) __seqcount_init(s, NULL, NULL) # define seqcount_lockdep_reader_access(x) #endif /** * SEQCNT_ZERO() - static initializer for seqcount_t * @name: Name of the seqcount_t instance */ #define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) } /* * Sequence counters with associated locks (seqcount_LOCKNAME_t) * * A sequence counter which associates the lock used for writer * serialization at initialization time. This enables lockdep to validate * that the write side critical section is properly serialized. * * For associated locks which do not implicitly disable preemption, * preemption protection is enforced in the write side function. * * Lockdep is never used in any for the raw write variants. * * See Documentation/locking/seqlock.rst */ /* * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot * disable preemption. It can lead to higher latencies, and the write side * sections will not be able to acquire locks which become sleeping locks * (e.g. spinlock_t). * * To remain preemptible while avoiding a possible livelock caused by the * reader preempting the writer, use a different technique: let the reader * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the * case, acquire then release the associated LOCKNAME writer serialization * lock. This will allow any possibly-preempted writer to make progress * until the end of its writer serialization lock critical section. * * This lock-unlock technique must be implemented for all of PREEMPT_RT * sleeping locks. See Documentation/locking/locktypes.rst */ #if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) #define __SEQ_LOCK(expr) expr #else #define __SEQ_LOCK(expr) #endif /* * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated * @seqcount: The real sequence counter * @lock: Pointer to the associated lock * * A plain sequence counter with external writer synchronization by * LOCKNAME @lock. The lock is associated to the sequence counter in the * static initializer or init function. This enables lockdep to validate * that the write side critical section is properly serialized. * * LOCKNAME: raw_spinlock, spinlock, rwlock, mutex, or ww_mutex. */ /* * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t * @s: Pointer to the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated lock */ #define seqcount_LOCKNAME_init(s, _lock, lockname) \ do { \ seqcount_##lockname##_t *____s = (s); \ seqcount_init(&____s->seqcount); \ __SEQ_LOCK(____s->lock = (_lock)); \ } while (0) #define seqcount_raw_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, raw_spinlock) #define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock) #define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock) #define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex) #define seqcount_ww_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, ww_mutex) /* * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t * * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @locktype: LOCKNAME canonical C data type * @preemptible: preemptibility of above locktype * @lockmember: argument for lockdep_assert_held() * @lockbase: associated lock release function (prefix only) * @lock_acquire: associated lock acquisition function (full call) */ #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \ typedef struct seqcount_##lockname { \ seqcount_t seqcount; \ __SEQ_LOCK(locktype *lock); \ } seqcount_##lockname##_t; \ \ static __always_inline seqcount_t * \ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline unsigned \ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ { \ unsigned seq = READ_ONCE(s->seqcount.sequence); \ \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return seq; \ \ if (preemptible && unlikely(seq & 1)) { \ __SEQ_LOCK(lock_acquire); \ __SEQ_LOCK(lockbase##_unlock(s->lock)); \ \ /* \ * Re-read the sequence counter since the (possibly \ * preempted) writer made progress. \ */ \ seq = READ_ONCE(s->seqcount.sequence); \ } \ \ return seq; \ } \ \ static __always_inline bool \ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return preemptible; \ \ /* PREEMPT_RT relies on the above LOCK+UNLOCK */ \ return false; \ } \ \ static __always_inline void \ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ { \ __SEQ_LOCK(lockdep_assert_held(lockmember)); \ } /* * __seqprop() for seqcount_t */ static inline seqcount_t *__seqprop_ptr(seqcount_t *s) { return s; } static inline unsigned __seqprop_sequence(const seqcount_t *s) { return READ_ONCE(s->sequence); } static inline bool __seqprop_preemptible(const seqcount_t *s) { return false; } static inline void __seqprop_assert(const seqcount_t *s) { lockdep_assert_preemption_disabled(); } #define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, s->lock, raw_spin, raw_spin_lock(s->lock)) SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, s->lock, spin, spin_lock(s->lock)) SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, s->lock, read, read_lock(s->lock)) SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex, mutex_lock(s->lock)) SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mutex, ww_mutex_lock(s->lock, NULL)) /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t * @name: Name of the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated LOCKNAME */ #define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ __SEQ_LOCK(.lock = (assoc_lock)) \ } #define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define __seqprop_case(s, lockname, prop) \ seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s)) #define __seqprop(s, prop) _Generic(*(s), \ seqcount_t: __seqprop_##prop((void *)(s)), \ __seqprop_case((s), raw_spinlock, prop), \ __seqprop_case((s), spinlock, prop), \ __seqprop_case((s), rwlock, prop), \ __seqprop_case((s), mutex, prop), \ __seqprop_case((s), ww_mutex, prop)) #define seqprop_ptr(s) __seqprop(s, ptr) #define seqprop_sequence(s) __seqprop(s, sequence) #define seqprop_preemptible(s) __seqprop(s, preemptible) #define seqprop_assert(s) __seqprop(s, assert) /** * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb() * barrier. Callers should ensure that smp_rmb() or equivalent ordering is * provided before actually loading any of the variables that are to be * protected in this critical section. * * Use carefully, only in critical code, and comment how the barrier is * provided. * * Return: count to be passed to read_seqcount_retry() */ #define __read_seqcount_begin(s) \ ({ \ unsigned __seq; \ \ while ((__seq = seqprop_sequence(s)) & 1) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) /** * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount_begin(s) \ ({ \ unsigned _seq = __read_seqcount_begin(s); \ \ smp_rmb(); \ _seq; \ }) /** * read_seqcount_begin() - begin a seqcount_t read critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define read_seqcount_begin(s) \ ({ \ seqcount_lockdep_reader_access(seqprop_ptr(s)); \ raw_read_seqcount_begin(s); \ }) /** * raw_read_seqcount() - read the raw seqcount_t counter value * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_read_seqcount opens a read critical section of the given * seqcount_t, without any lockdep checking, and without checking or * masking the sequence counter LSB. Calling code is responsible for * handling that. * * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount(s) \ ({ \ unsigned __seq = seqprop_sequence(s); \ \ smp_rmb(); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_seqcount_begin opens a read critical section of the given * seqcount_t. Unlike read_seqcount_begin(), this function will not wait * for the count to stabilize. If a writer is active when it begins, it * will fail the read_seqcount_retry() at the end of the read critical * section instead of stabilizing at the beginning of it. * * Use this only in special kernel hot paths where the read section is * small and has a high probability of success through other external * means. It will save a single branching instruction. * * Return: count to be passed to read_seqcount_retry() */ #define raw_seqcount_begin(s) \ ({ \ /* \ * If the counter is odd, let read_seqcount_retry() fail \ * by decrementing the counter. \ */ \ raw_read_seqcount(s) & ~1; \ }) /** * __read_seqcount_retry() - end a seqcount_t read section w/o barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() * barrier. Callers should ensure that smp_rmb() or equivalent ordering is * provided before actually loading any of the variables that are to be * protected in this critical section. * * Use carefully, only in critical code, and comment how the barrier is * provided. * * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ do___read_seqcount_retry(seqprop_ptr(s), start) static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) { kcsan_atomic_next(0); return unlikely(READ_ONCE(s->sequence) != start); } /** * read_seqcount_retry() - end a seqcount_t read critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * read_seqcount_retry closes the read critical section of given * seqcount_t. If the critical section was invalid, it must be ignored * (and typically retried). * * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ do_read_seqcount_retry(seqprop_ptr(s), start) static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); return do___read_seqcount_retry(s, start); } /** * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_begin() */ #define raw_write_seqcount_begin(s) \ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_raw_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_raw_write_seqcount_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); } /** * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_end() */ #define raw_write_seqcount_end(s) \ do { \ do_raw_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_begin_nested() - start a seqcount_t write section with * custom lockdep nesting level * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @subclass: lockdep nesting level * * See Documentation/locking/lockdep-design.rst * Context: check write_seqcount_begin() */ #define write_seqcount_begin_nested(s, subclass) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin_nested(seqprop_ptr(s), subclass); \ } while (0) static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) { do_raw_write_seqcount_begin(s); seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); } /** * write_seqcount_begin() - start a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: sequence counter write side sections must be serialized and * non-preemptible. Preemption will be automatically disabled if and * only if the seqcount write serialization lock is associated, and * preemptible. If readers can be invoked from hardirq or softirq * context, interrupts or bottom halves must be respectively disabled. */ #define write_seqcount_begin(s) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_write_seqcount_begin(seqcount_t *s) { do_write_seqcount_begin_nested(s, 0); } /** * write_seqcount_end() - end a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: Preemption will be automatically re-enabled if and only if * the seqcount write serialization lock is associated, and preemptible. */ #define write_seqcount_end(s) \ do { \ do_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_write_seqcount_end(seqcount_t *s) { seqcount_release(&s->dep_map, _RET_IP_); do_raw_write_seqcount_end(s); } /** * raw_write_seqcount_barrier() - do a seqcount_t write barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * This can be used to provide an ordering guarantee instead of the usual * consistency guarantee. It is one wmb cheaper, because it can collapse * the two back-to-back wmb()s. * * Note that writes surrounding the barrier should be declared atomic (e.g. * via WRITE_ONCE): a) to ensure the writes become visible to other threads * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section. This is necessary because * neither writes before and after the barrier are enclosed in a seq-writer * critical section that would ensure readers are aware of ongoing writes:: * * seqcount_t seq; * bool X = true, Y = false; * * void read(void) * { * bool x, y; * * do { * int s = read_seqcount_begin(&seq); * * x = X; y = Y; * * } while (read_seqcount_retry(&seq, s)); * * BUG_ON(!x && !y); * } * * void write(void) * { * WRITE_ONCE(Y, true); * * raw_write_seqcount_barrier(seq); * * WRITE_ONCE(X, false); * } */ #define raw_write_seqcount_barrier(s) \ do_raw_write_seqcount_barrier(seqprop_ptr(s)) static inline void do_raw_write_seqcount_barrier(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_invalidate() - invalidate in-progress seqcount_t read * side operations * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * After write_seqcount_invalidate, no seqcount_t read side operations * will complete successfully and see data older than this. */ #define write_seqcount_invalidate(s) \ do_write_seqcount_invalidate(seqprop_ptr(s)) static inline void do_write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); kcsan_nestable_atomic_begin(); s->sequence+=2; kcsan_nestable_atomic_end(); } /* * Latch sequence counters (seqcount_latch_t) * * A sequence counter variant where the counter even/odd value is used to * switch between two copies of protected data. This allows the read path, * typically NMIs, to safely interrupt the write side critical section. * * As the write sections are fully preemptible, no special handling for * PREEMPT_RT is needed. */ typedef struct { seqcount_t seqcount; } seqcount_latch_t; /** * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t * @seq_name: Name of the seqcount_latch_t instance */ #define SEQCNT_LATCH_ZERO(seq_name) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ } /** * seqcount_latch_init() - runtime initializer for seqcount_latch_t * @s: Pointer to the seqcount_latch_t instance */ #define seqcount_latch_init(s) seqcount_init(&(s)->seqcount) /** * raw_read_seqcount_latch() - pick even/odd latch data copy * @s: Pointer to seqcount_latch_t * * See raw_write_seqcount_latch() for details and a full reader/writer * usage example. * * Return: sequence counter raw value. Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked * with read_seqcount_latch_retry(). */ static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) { /* * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). * Due to the dependent load, a full smp_rmb() is not needed. */ return READ_ONCE(s->seqcount.sequence); } /** * read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from raw_read_seqcount_latch() * * Return: true if a read section retry is required, else false */ static inline int read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { return read_seqcount_retry(&s->seqcount, start); } /** * raw_write_seqcount_latch() - redirect latch readers to even/odd copy * @s: Pointer to seqcount_latch_t * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never * interrupt the modification -- e.g. the concurrency is strictly between CPUs * -- you most likely do not need this. * * Where the traditional RCU/lockless data structures rely on atomic * modifications to ensure queries observe either the old or the new state the * latch allows the same for non-atomic updates. The trade-off is doubling the * cost of storage; we have to maintain two copies of the entire data * structure. * * Very simply put: we first modify one copy and then the other. This ensures * there is always one copy in a stable state, ready to give us an answer. * * The basic form is a data structure like:: * * struct latch_struct { * seqcount_latch_t seq; * struct data_struct data[2]; * }; * * Where a modification, which is assumed to be externally serialized, does the * following:: * * void latch_modify(struct latch_struct *latch, ...) * { * smp_wmb(); // Ensure that the last data[1] update is visible * latch->seq.sequence++; * smp_wmb(); // Ensure that the seqcount update is visible * * modify(latch->data[0], ...); * * smp_wmb(); // Ensure that the data[0] update is visible * latch->seq.sequence++; * smp_wmb(); // Ensure that the seqcount update is visible * * modify(latch->data[1], ...); * } * * The query will have a form like:: * * struct entry *latch_query(struct latch_struct *latch, ...) * { * struct entry *entry; * unsigned seq, idx; * * do { * seq = raw_read_seqcount_latch(&latch->seq); * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); * * // This includes needed smp_rmb() * } while (read_seqcount_latch_retry(&latch->seq, seq)); * * return entry; * } * * So during the modification, queries are first redirected to data[1]. Then we * modify data[0]. When that is complete, we redirect queries back to data[0] * and we can modify data[1]. * * NOTE: * * The non-requirement for atomic modifications does _NOT_ include * the publishing of new entries in the case where data is a dynamic * data structure. * * An iteration might start in data[0] and get suspended long enough * to miss an entire modification sequence, once it resumes it might * observe the new entry. * * NOTE2: * * When data is a dynamic data structure; one should use regular RCU * patterns to manage the lifetimes of the objects within. */ static inline void raw_write_seqcount_latch(seqcount_latch_t *s) { smp_wmb(); /* prior stores before incrementing "sequence" */ s->seqcount.sequence++; smp_wmb(); /* increment "sequence" before following stores */ } /* * Sequential locks (seqlock_t) * * Sequence counters with an embedded spinlock for writer serialization * and non-preemptibility. * * For more info, see: * - Comments on top of seqcount_t * - Documentation/locking/seqlock.rst */ typedef struct { /* * Make sure that readers don't starve writers on PREEMPT_RT: use * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK(). */ seqcount_spinlock_t seqcount; spinlock_t lock; } seqlock_t; #define __SEQLOCK_UNLOCKED(lockname) \ { \ .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \ .lock = __SPIN_LOCK_UNLOCKED(lockname) \ } /** * seqlock_init() - dynamic initializer for seqlock_t * @sl: Pointer to the seqlock_t instance */ #define seqlock_init(sl) \ do { \ spin_lock_init(&(sl)->lock); \ seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock); \ } while (0) /** * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t * @sl: Name of the seqlock_t instance */ #define DEFINE_SEQLOCK(sl) \ seqlock_t sl = __SEQLOCK_UNLOCKED(sl) /** * read_seqbegin() - start a seqlock_t read side critical section * @sl: Pointer to seqlock_t * * Return: count, to be passed to read_seqretry() */ static inline unsigned read_seqbegin(const seqlock_t *sl) { unsigned ret = read_seqcount_begin(&sl->seqcount); kcsan_atomic_next(0); /* non-raw usage, assume closing read_seqretry() */ kcsan_flat_atomic_begin(); return ret; } /** * read_seqretry() - end a seqlock_t read side section * @sl: Pointer to seqlock_t * @start: count, from read_seqbegin() * * read_seqretry closes the read side critical section of given seqlock_t. * If the critical section was invalid, it must be ignored (and typically * retried). * * Return: true if a read section retry is required, else false */ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { /* * Assume not nested: read_seqretry() may be called multiple times when * completing read critical section. */ kcsan_flat_atomic_end(); return read_seqcount_retry(&sl->seqcount, start); } /* * For all seqlock_t write side functions, use the the internal * do_write_seqcount_begin() instead of generic write_seqcount_begin(). * This way, no redundant lockdep_assert_held() checks are added. */ /** * write_seqlock() - start a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_seqlock opens a write side critical section for the given * seqlock_t. It also implicitly acquires the spinlock_t embedded inside * that sequential lock. All seqlock_t write side sections are thus * automatically serialized and non-preemptible. * * Context: if the seqlock_t read section, or other write side critical * sections, can be invoked from hardirq or softirq contexts, use the * _irqsave or _bh variants of this function instead. */ static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock() - end a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_sequnlock closes the (serialized and non-preemptible) write side * critical section of given seqlock_t. */ static inline void write_sequnlock(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock(&sl->lock); } /** * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * _bh variant of write_seqlock(). Use only if the read side section, or * other write side sections, can be invoked from softirq contexts. */ static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_bh closes the serialized, non-preemptible, and * softirqs-disabled, seqlock_t write side critical section opened with * write_seqlock_bh(). */ static inline void write_sequnlock_bh(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_bh(&sl->lock); } /** * write_seqlock_irq() - start a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * _irq variant of write_seqlock(). Use only if the read side section, or * other write sections, can be invoked from hardirq contexts. */ static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_irq() - end a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_irq closes the serialized and non-interruptible * seqlock_t write side section opened with write_seqlock_irq(). */ static inline void write_sequnlock_irq(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irq(&sl->lock); } static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); do_write_seqcount_begin(&sl->seqcount.seqcount); return flags; } /** * write_seqlock_irqsave() - start a non-interruptible seqlock_t write * section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to write_sequnlock_irqrestore(). * * _irqsave variant of write_seqlock(). Use it only if the read side * section, or other write sections, can be invoked from hardirq context. */ #define write_seqlock_irqsave(lock, flags) \ do { flags = __write_seqlock_irqsave(lock); } while (0) /** * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write * section * @sl: Pointer to seqlock_t * @flags: Caller's saved interrupt state, from write_seqlock_irqsave() * * write_sequnlock_irqrestore closes the serialized and non-interruptible * seqlock_t write section previously opened with write_seqlock_irqsave(). */ static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqlock_excl() - begin a seqlock_t locking reader section * @sl: Pointer to seqlock_t * * read_seqlock_excl opens a seqlock_t locking reader critical section. A * locking reader exclusively locks out *both* other writers *and* other * locking readers, but it does not update the embedded sequence number. * * Locking readers act like a normal spin_lock()/spin_unlock(). * * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * The opened read section must be closed with read_sequnlock_excl(). */ static inline void read_seqlock_excl(seqlock_t *sl) { spin_lock(&sl->lock); } /** * read_sequnlock_excl() - end a seqlock_t locking reader critical section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl(seqlock_t *sl) { spin_unlock(&sl->lock); } /** * read_seqlock_excl_bh() - start a seqlock_t locking reader section with * softirqs disabled * @sl: Pointer to seqlock_t * * _bh variant of read_seqlock_excl(). Use this variant only if the * seqlock_t write side section, *or other read sections*, can be invoked * from softirq contexts. */ static inline void read_seqlock_excl_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); } /** * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking * reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_bh(seqlock_t *sl) { spin_unlock_bh(&sl->lock); } /** * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking * reader section * @sl: Pointer to seqlock_t * * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. */ static inline void read_seqlock_excl_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); } /** * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t * locking reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_irq(seqlock_t *sl) { spin_unlock_irq(&sl->lock); } static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl) { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); return flags; } /** * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t * locking reader section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to read_sequnlock_excl_irqrestore(). * * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. */ #define read_seqlock_excl_irqsave(lock, flags) \ do { flags = __read_seqlock_excl_irqsave(lock); } while (0) /** * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t * locking reader section * @sl: Pointer to seqlock_t * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave() */ static inline void read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) { spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader * @lock: Pointer to seqlock_t * @seq : Marker and return parameter. If the passed value is even, the * reader will become a *lockless* seqlock_t reader as in read_seqbegin(). * If the passed value is odd, the reader will become a *locking* reader * as in read_seqlock_excl(). In the first call to this function, the * caller *must* initialize and pass an even value to @seq; this way, a * lockless read can be optimistically tried first. * * read_seqbegin_or_lock is an API designed to optimistically try a normal * lockless seqlock_t read section first. If an odd counter is found, the * lockless read trial has failed, and the next read iteration transforms * itself into a full seqlock_t locking reader. * * This is typically used to avoid seqlock_t lockless readers starvation * (too much retry loops) in the case of a sharp spike in write side * activity. * * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * Check Documentation/locking/seqlock.rst for template example code. * * Return: the encountered sequence counter value, through the @seq * parameter, which is overloaded as a return parameter. This returned * value must be checked with need_seqretry(). If the read section need to * be retried, this returned value must also be passed as the @seq * parameter of the next read_seqbegin_or_lock() iteration. */ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) { if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl(lock); } /** * need_seqretry() - validate seqlock_t "locking or lockless" read section * @lock: Pointer to seqlock_t * @seq: sequence count, from read_seqbegin_or_lock() * * Return: true if a read section retry is required, false otherwise */ static inline int need_seqretry(seqlock_t *lock, int seq) { return !(seq & 1) && read_seqretry(lock, seq); } /** * done_seqretry() - end seqlock_t "locking or lockless" reader section * @lock: Pointer to seqlock_t * @seq: count, from read_seqbegin_or_lock() * * done_seqretry finishes the seqlock_t read side critical section started * with read_seqbegin_or_lock() and validated by need_seqretry(). */ static inline void done_seqretry(seqlock_t *lock, int seq) { if (seq & 1) read_sequnlock_excl(lock); } /** * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or * a non-interruptible locking reader * @lock: Pointer to seqlock_t * @seq: Marker and return parameter. Check read_seqbegin_or_lock(). * * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if * the seqlock_t write section, *or other read sections*, can be invoked * from hardirq context. * * Note: Interrupts will be disabled only for "locking reader" mode. * * Return: * * 1. The saved local interrupts state in case of a locking reader, to * be passed to done_seqretry_irqrestore(). * * 2. The encountered sequence counter value, returned through @seq * overloaded as a return parameter. Check read_seqbegin_or_lock(). */ static inline unsigned long read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) { unsigned long flags = 0; if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl_irqsave(lock, flags); return flags; } /** * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a * non-interruptible locking reader section * @lock: Pointer to seqlock_t * @seq: Count, from read_seqbegin_or_lock_irqsave() * @flags: Caller's saved local interrupt state in case of a locking * reader, also from read_seqbegin_or_lock_irqsave() * * This is the _irqrestore variant of done_seqretry(). The read section * must've been opened with read_seqbegin_or_lock_irqsave(), and validated * by need_seqretry(). */ static inline void done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) { if (seq & 1) read_sequnlock_excl_irqrestore(lock, flags); } #endif /* __LINUX_SEQLOCK_H */
1084 1085 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 // SPDX-License-Identifier: GPL-2.0-or-later /* * LAPB release 002 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * History * LAPB 001 Jonathan Naylor Started Coding * LAPB 002 Jonathan Naylor New timer architecture. * 2000-10-29 Henner Eisen lapb_data_indication() return status. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/stat.h> #include <linux/init.h> #include <net/lapb.h> static LIST_HEAD(lapb_list); static DEFINE_RWLOCK(lapb_list_lock); /* * Free an allocated lapb control block. */ static void lapb_free_cb(struct lapb_cb *lapb) { kfree(lapb); } static __inline__ void lapb_hold(struct lapb_cb *lapb) { refcount_inc(&lapb->refcnt); } static __inline__ void lapb_put(struct lapb_cb *lapb) { if (refcount_dec_and_test(&lapb->refcnt)) lapb_free_cb(lapb); } /* * Socket removal during an interrupt is now safe. */ static void __lapb_remove_cb(struct lapb_cb *lapb) { if (lapb->node.next) { list_del(&lapb->node); lapb_put(lapb); } } /* * Add a socket to the bound sockets list. */ static void __lapb_insert_cb(struct lapb_cb *lapb) { list_add(&lapb->node, &lapb_list); lapb_hold(lapb); } static struct lapb_cb *__lapb_devtostruct(struct net_device *dev) { struct lapb_cb *lapb, *use = NULL; list_for_each_entry(lapb, &lapb_list, node) { if (lapb->dev == dev) { use = lapb; break; } } if (use) lapb_hold(use); return use; } static struct lapb_cb *lapb_devtostruct(struct net_device *dev) { struct lapb_cb *rc; read_lock_bh(&lapb_list_lock); rc = __lapb_devtostruct(dev); read_unlock_bh(&lapb_list_lock); return rc; } /* * Create an empty LAPB control block. */ static struct lapb_cb *lapb_create_cb(void) { struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC); if (!lapb) goto out; skb_queue_head_init(&lapb->write_queue); skb_queue_head_init(&lapb->ack_queue); timer_setup(&lapb->t1timer, NULL, 0); timer_setup(&lapb->t2timer, NULL, 0); lapb->t1timer_running = false; lapb->t2timer_running = false; lapb->t1 = LAPB_DEFAULT_T1; lapb->t2 = LAPB_DEFAULT_T2; lapb->n2 = LAPB_DEFAULT_N2; lapb->mode = LAPB_DEFAULT_MODE; lapb->window = LAPB_DEFAULT_WINDOW; lapb->state = LAPB_STATE_0; spin_lock_init(&lapb->lock); refcount_set(&lapb->refcnt, 1); out: return lapb; } int lapb_register(struct net_device *dev, const struct lapb_register_struct *callbacks) { struct lapb_cb *lapb; int rc = LAPB_BADTOKEN; write_lock_bh(&lapb_list_lock); lapb = __lapb_devtostruct(dev); if (lapb) { lapb_put(lapb); goto out; } lapb = lapb_create_cb(); rc = LAPB_NOMEM; if (!lapb) goto out; lapb->dev = dev; lapb->callbacks = callbacks; __lapb_insert_cb(lapb); lapb_start_t1timer(lapb); rc = LAPB_OK; out: write_unlock_bh(&lapb_list_lock); return rc; } EXPORT_SYMBOL(lapb_register); int lapb_unregister(struct net_device *dev) { struct lapb_cb *lapb; int rc = LAPB_BADTOKEN; write_lock_bh(&lapb_list_lock); lapb = __lapb_devtostruct(dev); if (!lapb) goto out; lapb_put(lapb); /* Wait for other refs to "lapb" to drop */ while (refcount_read(&lapb->refcnt) > 2) usleep_range(1, 10); spin_lock_bh(&lapb->lock); lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); lapb_clear_queues(lapb); spin_unlock_bh(&lapb->lock); /* Wait for running timers to stop */ del_timer_sync(&lapb->t1timer); del_timer_sync(&lapb->t2timer); __lapb_remove_cb(lapb); lapb_put(lapb); rc = LAPB_OK; out: write_unlock_bh(&lapb_list_lock); return rc; } EXPORT_SYMBOL(lapb_unregister); int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) { int rc = LAPB_BADTOKEN; struct lapb_cb *lapb = lapb_devtostruct(dev); if (!lapb) goto out; spin_lock_bh(&lapb->lock); parms->t1 = lapb->t1 / HZ; parms->t2 = lapb->t2 / HZ; parms->n2 = lapb->n2; parms->n2count = lapb->n2count; parms->state = lapb->state; parms->window = lapb->window; parms->mode = lapb->mode; if (!timer_pending(&lapb->t1timer)) parms->t1timer = 0; else parms->t1timer = (lapb->t1timer.expires - jiffies) / HZ; if (!timer_pending(&lapb->t2timer)) parms->t2timer = 0; else parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ; spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; out: return rc; } EXPORT_SYMBOL(lapb_getparms); int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms) { int rc = LAPB_BADTOKEN; struct lapb_cb *lapb = lapb_devtostruct(dev); if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_INVALUE; if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1) goto out_put; if (lapb->state == LAPB_STATE_0) { if (parms->mode & LAPB_EXTENDED) { if (parms->window < 1 || parms->window > 127) goto out_put; } else { if (parms->window < 1 || parms->window > 7) goto out_put; } lapb->mode = parms->mode; lapb->window = parms->window; } lapb->t1 = parms->t1 * HZ; lapb->t2 = parms->t2 * HZ; lapb->n2 = parms->n2; rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_setparms); int lapb_connect_request(struct net_device *dev) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_OK; if (lapb->state == LAPB_STATE_1) goto out_put; rc = LAPB_CONNECTED; if (lapb->state == LAPB_STATE_3 || lapb->state == LAPB_STATE_4) goto out_put; lapb_establish_data_link(lapb); lapb_dbg(0, "(%p) S0 -> S1\n", lapb->dev); lapb->state = LAPB_STATE_1; rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_connect_request); static int __lapb_disconnect_request(struct lapb_cb *lapb) { switch (lapb->state) { case LAPB_STATE_0: return LAPB_NOTCONNECTED; case LAPB_STATE_1: lapb_dbg(1, "(%p) S1 TX DISC(1)\n", lapb->dev); lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev); lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); lapb->state = LAPB_STATE_0; lapb_start_t1timer(lapb); return LAPB_NOTCONNECTED; case LAPB_STATE_2: return LAPB_OK; } lapb_clear_queues(lapb); lapb->n2count = 0; lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); lapb_start_t1timer(lapb); lapb_stop_t2timer(lapb); lapb->state = LAPB_STATE_2; lapb_dbg(1, "(%p) S3 DISC(1)\n", lapb->dev); lapb_dbg(0, "(%p) S3 -> S2\n", lapb->dev); return LAPB_OK; } int lapb_disconnect_request(struct net_device *dev) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = __lapb_disconnect_request(lapb); spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_disconnect_request); int lapb_data_request(struct net_device *dev, struct sk_buff *skb) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_NOTCONNECTED; if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4) goto out_put; skb_queue_tail(&lapb->write_queue, skb); lapb_kick(lapb); rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_data_request); int lapb_data_received(struct net_device *dev, struct sk_buff *skb) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (lapb) { spin_lock_bh(&lapb->lock); lapb_data_input(lapb, skb); spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; } return rc; } EXPORT_SYMBOL(lapb_data_received); void lapb_connect_confirmation(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->connect_confirmation) lapb->callbacks->connect_confirmation(lapb->dev, reason); } void lapb_connect_indication(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->connect_indication) lapb->callbacks->connect_indication(lapb->dev, reason); } void lapb_disconnect_confirmation(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->disconnect_confirmation) lapb->callbacks->disconnect_confirmation(lapb->dev, reason); } void lapb_disconnect_indication(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->disconnect_indication) lapb->callbacks->disconnect_indication(lapb->dev, reason); } int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb) { if (lapb->callbacks->data_indication) return lapb->callbacks->data_indication(lapb->dev, skb); kfree_skb(skb); return NET_RX_SUCCESS; /* For now; must be != NET_RX_DROP */ } int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) { int used = 0; if (lapb->callbacks->data_transmit) { lapb->callbacks->data_transmit(lapb->dev, skb); used = 1; } return used; } /* Handle device status changes. */ static int lapb_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct lapb_cb *lapb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type != ARPHRD_X25) return NOTIFY_DONE; lapb = lapb_devtostruct(dev); if (!lapb) return NOTIFY_DONE; spin_lock_bh(&lapb->lock); switch (event) { case NETDEV_UP: lapb_dbg(0, "(%p) Interface up: %s\n", dev, dev->name); if (netif_carrier_ok(dev)) { lapb_dbg(0, "(%p): Carrier is already up: %s\n", dev, dev->name); if (lapb->mode & LAPB_DCE) { lapb_start_t1timer(lapb); } else { if (lapb->state == LAPB_STATE_0) { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } } } break; case NETDEV_GOING_DOWN: if (netif_carrier_ok(dev)) __lapb_disconnect_request(lapb); break; case NETDEV_DOWN: lapb_dbg(0, "(%p) Interface down: %s\n", dev, dev->name); lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb->n2count = 0; lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); break; case NETDEV_CHANGE: if (netif_carrier_ok(dev)) { lapb_dbg(0, "(%p): Carrier detected: %s\n", dev, dev->name); if (lapb->mode & LAPB_DCE) { lapb_start_t1timer(lapb); } else { if (lapb->state == LAPB_STATE_0) { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } } } else { lapb_dbg(0, "(%p) Carrier lost: %s\n", dev, dev->name); lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb->n2count = 0; lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); } break; } spin_unlock_bh(&lapb->lock); lapb_put(lapb); return NOTIFY_DONE; } static struct notifier_block lapb_dev_notifier = { .notifier_call = lapb_device_event, }; static int __init lapb_init(void) { return register_netdevice_notifier(&lapb_dev_notifier); } static void __exit lapb_exit(void) { WARN_ON(!list_empty(&lapb_list)); unregister_netdevice_notifier(&lapb_dev_notifier); } MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Link Access Procedure B link layer protocol"); MODULE_LICENSE("GPL"); module_init(lapb_init); module_exit(lapb_exit);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Manage send buffer. * Producer: * Copy user space data into send buffer, if send buffer space available. * Consumer: * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available. * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/tcp.h> #include "smc.h" #include "smc_wr.h" #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" #include "smc_tx.h" #include "smc_stats.h" #define SMC_TX_WORK_DELAY 0 #define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() * to wakeup sndbuf producers that blocked with smc_tx_wait(). * called under sk_socket lock. */ static void smc_tx_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; struct smc_sock *smc = smc_sk(sk); struct socket_wq *wq; /* similar to sk_stream_write_space */ if (atomic_read(&smc->conn.sndbuf_space) && sock) { if (test_bit(SOCK_NOSPACE, &sock->flags)) SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk); clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } /* Wakeup sndbuf producers that blocked with smc_tx_wait(). * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space(). */ void smc_tx_sndbuf_nonfull(struct smc_sock *smc) { if (smc->sk.sk_socket && test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags)) smc->sk.sk_write_space(&smc->sk); } /* blocks sndbuf producer until at least one byte of free space available * or urgent Byte was consumed */ static int smc_tx_wait(struct smc_sock *smc, int flags) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; long timeo; int rc = 0; /* similar to sk_stream_wait_memory */ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); add_wait_queue(sk_sleep(sk), &wait); while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || conn->killed || conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { rc = -EPIPE; break; } if (smc_cdc_rxed_any_close(conn)) { rc = -ECONNRESET; break; } if (!timeo) { /* ensure EPOLLOUT is subsequently generated */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); rc = -EAGAIN; break; } if (signal_pending(current)) { rc = sock_intr_errno(timeo); break; } sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend) break; /* at least 1 byte of free & no urgent data */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk_wait_event(sk, &timeo, sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || smc_cdc_rxed_any_close(conn) || (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend), &wait); } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static bool smc_tx_is_corked(struct smc_sock *smc) { struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } /* sndbuf producer: main API called by socket layer. * called under sock lock. */ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) { size_t copylen, send_done = 0, send_remaining = len; size_t chunk_len, chunk_off, chunk_len_sum; struct smc_connection *conn = &smc->conn; union smc_host_cursor prep; struct sock *sk = &smc->sk; char *sndbuf_base; int tx_cnt_prep; int writespace; int rc, chunk; /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { rc = -EPIPE; goto out_err; } if (sk->sk_state == SMC_INIT) return -ENOTCONN; if (len > conn->sndbuf_desc->len) SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk); if (len > conn->peer_rmbe_size) SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk); if (msg->msg_flags & MSG_OOB) SMC_STAT_INC(smc, urg_data_cnt); while (msg_data_left(msg)) { if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || conn->killed) return -EPIPE; if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; if (msg->msg_flags & MSG_OOB) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { if (send_done) return send_done; rc = smc_tx_wait(smc, msg->msg_flags); if (rc) goto out_err; continue; } /* initialize variables for 1st iteration of subsequent loop */ /* could be just 1 byte, even after smc_tx_wait above */ writespace = atomic_read(&conn->sndbuf_space); /* not more than what user space asked for */ copylen = min_t(size_t, send_remaining, writespace); /* determine start of sndbuf */ sndbuf_base = conn->sndbuf_desc->cpu_addr; smc_curs_copy(&prep, &conn->tx_curs_prep, conn); tx_cnt_prep = prep.count; /* determine chunks where to write into sndbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len - tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; smc_sndbuf_sync_sg_for_cpu(conn); for (chunk = 0; chunk < 2; chunk++) { rc = memcpy_from_msg(sndbuf_base + chunk_off, msg, chunk_len); if (rc) { smc_sndbuf_sync_sg_for_device(conn); if (send_done) return send_done; goto out_err; } send_done += chunk_len; send_remaining -= chunk_len; if (chunk_len_sum == copylen) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ chunk_len = copylen - chunk_len; /* remainder */ chunk_len_sum += chunk_len; chunk_off = 0; /* modulo offset in send ring buffer */ } smc_sndbuf_sync_sg_for_device(conn); /* update cursors */ smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); smc_curs_copy(&conn->tx_curs_prep, &prep, conn); /* increased in send tasklet smc_cdc_tx_handler() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->sndbuf_space); /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); /* since we just produced more new data into sndbuf, * trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && (atomic_read(&conn->sndbuf_space) > (conn->sndbuf_desc->len >> 1))) /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_CORK_DELAY); else smc_tx_sndbuf_nonempty(conn); } /* while (msg_data_left(msg)) */ return send_done; out_err: rc = sk_stream_error(sk, msg->msg_flags, rc); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(rc == -EAGAIN)) sk->sk_write_space(sk); return rc; } /***************************** sndbuf consumer *******************************/ /* sndbuf consumer: actual data transfer of one target chunk with ISM write */ int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, u32 offset, int signal) { struct smc_ism_position pos; int rc; memset(&pos, 0, sizeof(pos)); pos.token = conn->peer_token; pos.index = conn->peer_rmbe_idx; pos.offset = conn->tx_off + offset; pos.signal = signal; rc = smc_ism_write(conn->lgr->smcd, &pos, data, len); if (rc) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return rc; } /* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; struct smc_link *link = conn->lnk; int rc; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); return rc; } /* sndbuf consumer */ static inline void smc_tx_advance_cursors(struct smc_connection *conn, union smc_host_cursor *prod, union smc_host_cursor *sent, size_t len) { smc_curs_add(conn->peer_rmbe_size, prod, len); /* increased in recv tasklet smc_cdc_msg_rcv() */ smp_mb__before_atomic(); /* data in flight reduces usable snd_wnd */ atomic_sub(len, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); smc_curs_add(conn->sndbuf_desc->len, sent, len); } /* SMC-R helper for smc_tx_rdma_writes() */ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t src_off, size_t src_len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { struct smc_link *link = conn->lnk; dma_addr_t dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; int num_sges; int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { struct ib_sge *sge = wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list; num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { sge[srcchunk].addr = dma_addr + src_off; sge[srcchunk].length = src_len; num_sges++; src_off += src_len; if (src_off >= conn->sndbuf_desc->len) src_off -= conn->sndbuf_desc->len; /* modulo in send ring */ if (src_len_sum == dst_len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } rc = smc_tx_rdma_write(conn, dst_off, num_sges, &wr_rdma_buf->wr_tx_rdma[dstchunk]); if (rc) return rc; if (dst_len_sum == len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ dst_off = 0; /* modulo offset in RMBE ring buffer */ dst_len = len - dst_len; /* remainder */ dst_len_sum += dst_len; src_len = min_t(int, dst_len, conn->sndbuf_desc->len - sent_count); src_len_sum = src_len; } return 0; } /* SMC-D helper for smc_tx_rdma_writes() */ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t src_off, size_t src_len, size_t dst_off, size_t dst_len) { int src_len_sum = src_len, dst_len_sum = dst_len; int srcchunk, dstchunk; int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { for (srcchunk = 0; srcchunk < 2; srcchunk++) { void *data = conn->sndbuf_desc->cpu_addr + src_off; rc = smcd_tx_ism_write(conn, data, src_len, dst_off + sizeof(struct smcd_cdc_msg), 0); if (rc) return rc; dst_off += src_len; src_off += src_len; if (src_off >= conn->sndbuf_desc->len) src_off -= conn->sndbuf_desc->len; /* modulo in send ring */ if (src_len_sum == dst_len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } if (dst_len_sum == len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ dst_off = 0; /* modulo offset in RMBE ring buffer */ dst_len = len - dst_len; /* remainder */ dst_len_sum += dst_len; src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off); src_len_sum = src_len; } return 0; } /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; * usable snd_wnd as max transmit */ static int smc_tx_rdma_writes(struct smc_connection *conn, struct smc_rdma_wr *wr_rdma_buf) { size_t len, src_len, dst_off, dst_len; /* current chunk values */ union smc_host_cursor sent, prep, prod, cons; struct smc_cdc_producer_flags *pflags; int to_send, rmbespace; int rc; /* source: sndbuf */ smc_curs_copy(&sent, &conn->tx_curs_sent, conn); smc_curs_copy(&prep, &conn->tx_curs_prep, conn); /* cf. wmem_alloc - (snd_max - snd_una) */ to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); if (to_send <= 0) return 0; /* destination: RMBE */ /* cf. snd_wnd */ rmbespace = atomic_read(&conn->peer_rmbe_space); if (rmbespace <= 0) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); return 0; } smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); /* if usable snd_wnd closes ask peer to advertise once it opens again */ pflags = &conn->local_tx_ctrl.prod_flags; pflags->write_blocked = (to_send >= rmbespace); /* cf. usable snd_wnd */ len = min(to_send, rmbespace); /* initialize variables for first iteration of subsequent nested loop */ dst_off = prod.count; if (prod.wrap == cons.wrap) { /* the filled destination area is unwrapped, * hence the available free destination space is wrapped * and we need 2 destination chunks of sum len; start with 1st * which is limited by what's available in sndbuf */ dst_len = min_t(size_t, conn->peer_rmbe_size - prod.count, len); } else { /* the filled destination area is wrapped, * hence the available free destination space is unwrapped * and we need a single destination chunk of entire len */ dst_len = len; } /* dst_len determines the maximum src_len */ if (sent.count + dst_len <= conn->sndbuf_desc->len) { /* unwrapped src case: single chunk of entire dst_len */ src_len = dst_len; } else { /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ src_len = conn->sndbuf_desc->len - sent.count; } if (conn->lgr->is_smcd) rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len, dst_off, dst_len); else rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len, dst_off, dst_len, wr_rdma_buf); if (rc) return rc; if (conn->urg_tx_pend && len == to_send) pflags->urg_data_present = 1; smc_tx_advance_cursors(conn, &prod, &sent, len); /* update connection's cursors with advanced local cursors */ smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn); /* dst: peer RMBE */ smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */ return 0; } /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; struct smc_link *link = conn->lnk; struct smc_rdma_wr *wr_rdma_buf; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; if (!link || !smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { smc_wr_tx_link_put(link); if (rc == -EBUSY) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); if (smc->sk.sk_err == ECONNABORTED) return sock_error(&smc->sk); if (conn->killed) return -EPIPE; rc = 0; mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_WORK_DELAY); } return rc; } spin_lock_bh(&conn->send_lock); if (link != conn->lnk) { /* link of connection changed, tx_work will restart */ smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); rc = -ENOLINK; goto out_unlock; } if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } } rc = smc_cdc_msg_send(conn, wr_buf, pend); if (!rc && pflags->urg_data_present) { pflags->urg_data_pending = 0; pflags->urg_data_present = 0; } out_unlock: spin_unlock_bh(&conn->send_lock); smc_wr_tx_link_put(link); return rc; } static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; int rc = 0; spin_lock_bh(&conn->send_lock); if (!pflags->urg_data_present) rc = smc_tx_rdma_writes(conn, NULL); if (!rc) rc = smcd_cdc_msg_send(conn); if (!rc && pflags->urg_data_present) { pflags->urg_data_pending = 0; pflags->urg_data_present = 0; } spin_unlock_bh(&conn->send_lock); return rc; } int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { int rc; if (conn->killed || conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) return -EPIPE; /* connection being aborted */ if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else rc = smcr_tx_sndbuf_nonempty(conn); if (!rc) { /* trigger socket release if connection is closing */ struct smc_sock *smc = container_of(conn, struct smc_sock, conn); smc_close_wake_tx_prepared(smc); } return rc; } void smc_tx_pending(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc; if (smc->sk.sk_err) return; rc = smc_tx_sndbuf_nonempty(conn); if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; } /* Wakeup sndbuf consumers from process context * since there is more data to transmit */ void smc_tx_work(struct work_struct *work) { struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); lock_sock(&smc->sk); smc_tx_pending(conn); release_sock(&smc->sk); } void smc_tx_consumer_update(struct smc_connection *conn, bool force) { union smc_host_cursor cfed, cons, prod; int sender_free = conn->rmb_desc->len; int to_confirm; smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn); to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); if (to_confirm > conn->rmbe_update_limit) { smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn); sender_free = conn->rmb_desc->len - smc_curs_diff_large(conn->rmb_desc->len, &cfed, &prod); } if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || force || ((to_confirm > conn->rmbe_update_limit) && ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { if (conn->killed || conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && !conn->killed) { queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_WORK_DELAY); return; } } if (conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; } /***************************** send initialize *******************************/ /* Initialize send properties on connection establishment. NB: not __init! */ void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; }
90 91 91 14 96 97 67 91 91 91 91 91 605 564 564 67 564 91 89 75 91 91 91 91 3 91 1 91 74 91 5 91 78 10 77 77 34 5 63 63 10 10 60 63 63 63 8 63 63 63 11 39 63 39 63 11 63 63 63 63 61 49 10 50 44 45 45 58 63 63 63 63 63 63 63 63 63 63 61 91 63 63 2 63 63 39 63 60 60 60 60 60 60 57 91 91 91 91 91 91 91 91 63 63 63 1 91 91 91 91 81 91 60 91 55 91 61 91 91 91 39 91 8 91 9 91 44 91 91 91 91 91 91 91 3 91 91 91 91 91 91 44 55 90 91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. * kswapd added: 7.1.96 sct * Removed kswapd_ctl limits, and swap out as many pages as needed * to bring the system back to freepages.high: 2.4.97, Rik van Riel. * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). * Multiqueue VM started 5.8.00, Rik van Riel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/module.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/vmpressure.h> #include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> /* for try_to_release_page(), buffer_heads_over_limit */ #include <linux/mm_inline.h> #include <linux/backing-dev.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> #include <linux/rwsem.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> #include <linux/migrate.h> #include <linux/delayacct.h> #include <linux/sysctl.h> #include <linux/oom.h> #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> #include <linux/psi.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> #include <linux/balloon_compaction.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes * are scanned. */ nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. */ struct mem_cgroup *target_mem_cgroup; /* * Scan pressure balancing between anon and file LRUs */ unsigned long anon_cost; unsigned long file_cost; /* Can active pages be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 unsigned int may_deactivate:2; unsigned int force_deactivate:1; unsigned int skipped_deactivate:1; /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; /* Can mapped pages be reclaimed? */ unsigned int may_unmap:1; /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; /* * Cgroup memory below memory.low is protected as long as we * don't threaten to OOM. If any cgroup is reclaimed at * reduced force or passed over entirely due to its memory.low * setting (memcg_low_skipped), and nothing is reclaimed as a * result, then go back for one more cycle that reclaims the protected * memory (memcg_low_reclaim) to avert OOM. */ unsigned int memcg_low_reclaim:1; unsigned int memcg_low_skipped:1; unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; /* There is easily reclaimable cold cache in the current node */ unsigned int cache_trim_mode:1; /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; /* Allocation order */ s8 order; /* Scan (total_size >> priority) pages at once */ s8 priority; /* The highest zone to isolate pages for reclaim from */ s8 reclaim_idx; /* This context's GFP mask */ gfp_t gfp_mask; /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; struct { unsigned int dirty; unsigned int unqueued_dirty; unsigned int congested; unsigned int writeback; unsigned int immediate; unsigned int file_taken; unsigned int taken; } nr; /* for recording the reclaimed slab by now */ struct reclaim_state reclaim_state; }; #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_page(_page, _base, _field) \ do { \ if ((_page)->lru.prev != _base) { \ struct page *prev; \ \ prev = lru_to_page(&(_page->lru)); \ prefetchw(&prev->_field); \ } \ } while (0) #else #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) #endif /* * From 0 .. 200. Higher means more swappy. */ int vm_swappiness = 60; static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { /* Check for an overwrite */ WARN_ON_ONCE(rs && task->reclaim_state); /* Check for the nulling of an already-nulled member */ WARN_ON_ONCE(!rs && !task->reclaim_state); task->reclaim_state = rs; } static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; /* The shrinker_info is expanded in a batch of BITS_PER_LONG */ static inline int shrinker_map_size(int nr_items) { return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); } static inline int shrinker_defer_size(int nr_items) { return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); } static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, lockdep_is_held(&shrinker_rwsem)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, int map_size, int defer_size, int old_map_size, int old_defer_size) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; int size = map_size + defer_size; for_each_node(nid) { pn = memcg->nodeinfo[nid]; old = shrinker_info_protected(memcg, nid); /* Not yet online memcg */ if (!old) return 0; new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); if (!new) return -ENOMEM; new->nr_deferred = (atomic_long_t *)(new + 1); new->map = (void *)new->nr_deferred + defer_size; /* map: set all old bits, clear all new bits */ memset(new->map, (int)0xff, old_map_size); memset((void *)new->map + old_map_size, 0, map_size - old_map_size); /* nr_deferred: copy old values, clear all new values */ memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); memset((void *)new->nr_deferred + old_defer_size, 0, defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); } return 0; } void free_shrinker_info(struct mem_cgroup *memcg) { struct mem_cgroup_per_node *pn; struct shrinker_info *info; int nid; for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); kvfree(info); rcu_assign_pointer(pn->shrinker_info, NULL); } } int alloc_shrinker_info(struct mem_cgroup *memcg) { struct shrinker_info *info; int nid, size, ret = 0; int map_size, defer_size = 0; down_write(&shrinker_rwsem); map_size = shrinker_map_size(shrinker_nr_max); defer_size = shrinker_defer_size(shrinker_nr_max); size = map_size + defer_size; for_each_node(nid) { info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); if (!info) { free_shrinker_info(memcg); ret = -ENOMEM; break; } info->nr_deferred = (atomic_long_t *)(info + 1); info->map = (void *)info->nr_deferred + defer_size; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem); return ret; } static inline bool need_expand(int nr_max) { return round_up(nr_max, BITS_PER_LONG) > round_up(shrinker_nr_max, BITS_PER_LONG); } static int expand_shrinker_info(int new_id) { int ret = 0; int new_nr_max = new_id + 1; int map_size, defer_size = 0; int old_map_size, old_defer_size = 0; struct mem_cgroup *memcg; if (!need_expand(new_nr_max)) goto out; if (!root_mem_cgroup) goto out; lockdep_assert_held(&shrinker_rwsem); map_size = shrinker_map_size(new_nr_max); defer_size = shrinker_defer_size(new_nr_max); old_map_size = shrinker_map_size(shrinker_nr_max); old_defer_size = shrinker_defer_size(shrinker_nr_max); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { ret = expand_one_shrinker_info(memcg, map_size, defer_size, old_map_size, old_defer_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); out: if (!ret) shrinker_nr_max = new_nr_max; return ret; } void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id, info->map); rcu_read_unlock(); } } static DEFINE_IDR(shrinker_idr); static int prealloc_memcg_shrinker(struct shrinker *shrinker) { int id, ret = -ENOMEM; if (mem_cgroup_disabled()) return -ENOSYS; down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; if (id >= shrinker_nr_max) { if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); goto unlock; } } shrinker->id = id; ret = 0; unlock: up_write(&shrinker_rwsem); return ret; } static void unregister_memcg_shrinker(struct shrinker *shrinker) { int id = shrinker->id; BUG_ON(id < 0); lockdep_assert_held(&shrinker_rwsem); idr_remove(&shrinker_idr, id); } static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; info = shrinker_info_protected(memcg, nid); return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; info = shrinker_info_protected(memcg, nid); return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } void reparent_shrinker_deferred(struct mem_cgroup *memcg) { int i, nid; long nr; struct mem_cgroup *parent; struct shrinker_info *child_info, *parent_info; parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ down_read(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); for (i = 0; i < shrinker_nr_max; i++) { nr = atomic_long_read(&child_info->nr_deferred[i]); atomic_long_add(nr, &parent_info->nr_deferred[i]); } } up_read(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } /** * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in * shrink_page_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. */ static bool writeback_throttling_sane(struct scan_control *sc) { if (!cgroup_reclaim(sc)) return true; #ifdef CONFIG_CGROUP_WRITEBACK if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return true; #endif return false; } #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { return -ENOSYS; } static void unregister_memcg_shrinker(struct shrinker *shrinker) { } static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { return 0; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { return 0; } static bool cgroup_reclaim(struct scan_control *sc) { return false; } static bool writeback_throttling_sane(struct scan_control *sc) { return true; } #endif static long xchg_nr_deferred(struct shrinker *shrinker, struct shrink_control *sc) { int nid = sc->nid; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) nid = 0; if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE)) return xchg_nr_deferred_memcg(nid, shrinker, sc->memcg); return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); } static long add_nr_deferred(long nr, struct shrinker *shrinker, struct shrink_control *sc) { int nid = sc->nid; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) nid = 0; if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE)) return add_nr_deferred_memcg(nr, nid, shrinker, sc->memcg); return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); } static bool can_demote(int nid, struct scan_control *sc) { if (!numa_demotion_enabled) return false; if (sc) { if (sc->no_demotion) return false; /* It is pointless to do demotion in memcg reclaim */ if (cgroup_reclaim(sc)) return false; } if (next_demotion_node(nid) == NUMA_NO_NODE) return false; return true; } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid, struct scan_control *sc) { if (memcg == NULL) { /* * For non-memcg reclaim, is there * space in any swap device? */ if (get_nr_swap_pages() > 0) return true; } else { /* Is the memcg below its swap limit? */ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) return true; } /* * The page can not be swapped. * * Can it be reclaimed from this node via demotion? */ return can_demote(nid, sc); } /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is * not expected that isolated pages will be a dominating factor. */ unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); return nr; } /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) */ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; if (!managed_zone(zone)) continue; if (!mem_cgroup_disabled()) size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); } return size; } /* * Add a shrinker callback to be called from the vm. */ int prealloc_shrinker(struct shrinker *shrinker) { unsigned int size; int err; if (shrinker->flags & SHRINKER_MEMCG_AWARE) { err = prealloc_memcg_shrinker(shrinker); if (err != -ENOSYS) return err; shrinker->flags &= ~SHRINKER_MEMCG_AWARE; } size = sizeof(*shrinker->nr_deferred); if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) return -ENOMEM; return 0; } void free_prealloced_shrinker(struct shrinker *shrinker) { if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); return; } kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; up_write(&shrinker_rwsem); } int register_shrinker(struct shrinker *shrinker) { int err = prealloc_shrinker(shrinker); if (err) return err; register_shrinker_prepared(shrinker); return 0; } EXPORT_SYMBOL(register_shrinker); /* * Remove one */ void unregister_shrinker(struct shrinker *shrinker) { if (!(shrinker->flags & SHRINKER_REGISTERED)) return; down_write(&shrinker_rwsem); list_del(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, struct shrinker *shrinker, int priority) { unsigned long freed = 0; unsigned long long delta; long total_scan; long freeable; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ nr = xchg_nr_deferred(shrinker, shrinkctl); if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); } else { /* * These objects don't require any IO to create. Trim * them aggressively under memory pressure to keep * them from causing refetches in the IO caches. */ delta = freeable / 2; } total_scan = nr >> priority; total_scan += delta; total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); /* * Normally, we should not scan less than batch_size objects in one * pass to avoid too frequent shrinker calls, but if the slab has less * than batch_size objects in total and we are really tight on memory, * we will try to reclaim all available objects, otherwise we can end * up failing allocations although there are plenty of reclaimable * objects spread over several slabs with usage less than the * batch_size. * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; shrinkctl->nr_scanned = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); total_scan -= shrinkctl->nr_scanned; scanned += shrinkctl->nr_scanned; cond_resched(); } /* * The deferred work is increased by any new work (delta) that wasn't * done, decreased by old deferred work that was done now. * * And it is capped to two times of the freeable items. */ next_deferred = max_t(long, (nr + delta - scanned), 0); next_deferred = min(next_deferred, (2 * freeable)); /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. */ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; } #ifdef CONFIG_MEMCG static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { struct shrinker_info *info; unsigned long ret, freed = 0; int i; if (!mem_cgroup_online(memcg)) return 0; if (!down_read_trylock(&shrinker_rwsem)) return 0; info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; for_each_set_bit(i, info->map, shrinker_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; struct shrinker *shrinker; shrinker = idr_find(&shrinker_idr, i); if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { if (!shrinker) clear_bit(i, info->map); continue; } /* Call non-slab shrinkers even though kmem is disabled */ if (!memcg_kmem_enabled() && !(shrinker->flags & SHRINKER_NONSLAB)) continue; ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { clear_bit(i, info->map); /* * After the shrinker reported that it had no objects to * free, but before we cleared the corresponding bit in * the memcg shrinker map, a new object might have been * added. To make sure, we have the bit set in this * case, we invoke the shrinker one more time and reset * the bit if it reports that it is not empty anymore. * The memory barrier here pairs with the barrier in * set_shrinker_bit(): * * list_lru_add() shrink_slab_memcg() * list_add_tail() clear_bit() * <MB> <MB> * set_bit() do_shrink_slab() */ smp_mb__after_atomic(); ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; else set_shrinker_bit(memcg, nid, i); } freed += ret; if (rwsem_is_contended(&shrinker_rwsem)) { freed = freed ? : 1; break; } } unlock: up_read(&shrinker_rwsem); return freed; } #else /* CONFIG_MEMCG */ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { return 0; } #endif /* CONFIG_MEMCG */ /** * shrink_slab - shrink slab caches * @gfp_mask: allocation context * @nid: node whose slab caches to target * @memcg: memory cgroup whose slab caches to target * @priority: the reclaim priority * * Call the shrink functions to age shrinkable caches. * * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * * @memcg specifies the memory cgroup to target. Unaware shrinkers * are called only if it is the root cgroup. * * @priority is sc->priority, we take the number of objects and >> by priority * in order to get the scan target. * * Returns the number of reclaimed slab objects. */ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { unsigned long ret, freed = 0; struct shrinker *shrinker; /* * The root memcg might be allocated even though memcg is disabled * via "cgroup_disable=memory" boot parameter. This could make * mem_cgroup_is_root() return false, then just run memcg slab * shrink, but skip global shrink. This may result in premature * oom. */ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); if (!down_read_trylock(&shrinker_rwsem)) goto out; list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; freed += ret; /* * Bail out if someone want to register a new shrinker to * prevent the registration from being stalled for long periods * by parallel ongoing shrinking. */ if (rwsem_is_contended(&shrinker_rwsem)) { freed = freed ? : 1; break; } } up_read(&shrinker_rwsem); out: cond_resched(); return freed; } void drop_slab_node(int nid) { unsigned long freed; int shift = 0; do { struct mem_cgroup *memcg = NULL; if (fatal_signal_pending(current)) return; freed = 0; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); } while ((freed >> shift++) > 1); } void drop_slab(void) { int nid; for_each_online_node(nid) drop_slab_node(nid); } static inline int is_page_cache_freeable(struct page *page) { /* * A freeable page cache page is referenced only by the caller * that isolated the page, the page cache and optional buffer * heads at page->private. */ int page_cache_pins = thp_nr_pages(page); return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } static int may_write_to_inode(struct inode *inode) { if (current->flags & PF_SWAPWRITE) return 1; if (!inode_write_congested(inode)) return 1; if (inode_to_bdi(inode) == current->backing_dev_info) return 1; return 0; } /* * We detected a synchronous write error writing a page out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent * fsync(), msync() or close(). * * The tricky part is that after writepage we cannot touch the mapping: nothing * prevents it from being freed up. But we have a ref on the page and once * that page is locked, the mapping is pinned. * * We're allowed to run sleeping lock_page() here because we know the caller has * __GFP_FS. */ static void handle_write_error(struct address_space *mapping, struct page *page, int error) { lock_page(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); } /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ PAGE_KEEP, /* move page to the active list, page is locked */ PAGE_ACTIVATE, /* page has been sent to the disk successfully, page is unlocked */ PAGE_SUCCESS, /* page is clean and locked */ PAGE_CLEAN, } pageout_t; /* * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write * will be non-blocking. To prevent this allocation from being * stalled by pagecache activity. But note that there may be * stalls if we need to run get_block(). We could test * PagePrivate for that. * * If this process is currently in __generic_file_write_iter() against * this page's queue, we can perform writeback even if that * will block. * * If the page is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; if (!mapping) { /* * Some data journaling orphaned pages can have * page->mapping == NULL while being dirty with clean buffers. */ if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } return PAGE_KEEP; } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; if (!may_write_to_inode(mapping->host)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { int res; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, .for_reclaim = 1, }; SetPageReclaim(page); res = mapping->a_ops->writepage(page, &wbc); if (res < 0) handle_write_error(mapping, page, res); if (res == AOP_WRITEPAGE_ACTIVATE) { ClearPageReclaim(page); return PAGE_ACTIVATE; } if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } trace_mm_vmscan_writepage(page); inc_node_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } return PAGE_CLEAN; } /* * Same as remove_mapping, but if the page is removed from the mapping, it * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed, struct mem_cgroup *target_memcg) { int refcount; void *shadow = NULL; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); xa_lock_irq(&mapping->i_pages); /* * The non racy check for a busy page. * * Must be careful with the order of the tests. When someone has * a ref to the page, it may be possible that they dirty it then * drop the reference. So if PageDirty is tested before page_count * here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); * !PageDirty(page) [good] * SetPageDirty(page); * put_page(page); * !page_count(page) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the page->flags * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under the i_pages lock, then this ordering is not required. */ refcount = 1 + compound_nr(page); if (!page_ref_freeze(page, refcount)) goto cannot_free; /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ if (unlikely(PageDirty(page))) { page_ref_unfreeze(page, refcount); goto cannot_free; } if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_swap_cache(page, swap, shadow); xa_unlock_irq(&mapping->i_pages); put_swap_page(page, swap); } else { void (*freepage)(struct page *); freepage = mapping->a_ops->freepage; /* * Remember a shadow entry for reclaimed file cache in * order to detect refaults, thus thrashing, later on. * * But don't store shadows in an address space that is * already exiting. This is not just an optimization, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. * * We also don't store shadows for DAX mappings because the * only page cache pages found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same address_space. */ if (reclaimed && page_is_file_lru(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); xa_unlock_irq(&mapping->i_pages); if (freepage != NULL) freepage(page); } return 1; cannot_free: xa_unlock_irq(&mapping->i_pages); return 0; } /* * Attempt to detach a locked page from its ->mapping. If it is dirty or if * someone else has a ref on the page, abort and return 0. If it was * successfully detached, return 1. Assumes the caller has a single ref on * this page. */ int remove_mapping(struct address_space *mapping, struct page *page) { if (__remove_mapping(mapping, page, false, NULL)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another * atomic operation. */ page_ref_unfreeze(page, 1); return 1; } return 0; } /** * putback_lru_page - put previously isolated page onto appropriate LRU list * @page: page to be put back to appropriate lru list * * Add previously isolated @page to appropriate LRU list. * Page may still be unevictable for other reasons. * * lru_lock must not be held, interrupts must be enabled. */ void putback_lru_page(struct page *page) { lru_cache_add(page); put_page(page); /* drop ref from isolate */ } enum page_references { PAGEREF_RECLAIM, PAGEREF_RECLAIM_CLEAN, PAGEREF_KEEP, PAGEREF_ACTIVATE, }; static enum page_references page_check_references(struct page *page, struct scan_control *sc) { int referenced_ptes, referenced_page; unsigned long vm_flags; referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, &vm_flags); referenced_page = TestClearPageReferenced(page); /* * Mlock lost the isolation race with us. Let try_to_unmap() * move the page to the unevictable list. */ if (vm_flags & VM_LOCKED) return PAGEREF_RECLAIM; if (referenced_ptes) { /* * All mapped pages start out with page table * references from the instantiating fault, so we need * to look twice if a mapped file page is used more * than once. * * Mark it and spare it for another trip around the * inactive list. Another page table reference will * lead to its activation. * * Note: the mark is set for activated pages as well * so that recently deactivated but used pages are * quickly recovered. */ SetPageReferenced(page); if (referenced_page || referenced_ptes > 1) return PAGEREF_ACTIVATE; /* * Activate file-backed executable pages after first usage. */ if ((vm_flags & VM_EXEC) && !PageSwapBacked(page)) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; } /* Reclaim if clean, defer dirty pages to writeback */ if (referenced_page && !PageSwapBacked(page)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; } /* Check if a page is dirty or under writeback */ static void page_check_dirty_writeback(struct page *page, bool *dirty, bool *writeback) { struct address_space *mapping; /* * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ if (!page_is_file_lru(page) || (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; return; } /* By default assume that the page flags are accurate */ *dirty = PageDirty(page); *writeback = PageWriteback(page); /* Verify dirty/writeback state if the filesystem supports it */ if (!page_has_private(page)) return; mapping = page_mapping(page); if (mapping && mapping->a_ops->is_dirty_writeback) mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } static struct page *alloc_demote_page(struct page *page, unsigned long node) { struct migration_target_control mtc = { /* * Allocate from 'node', or fail quickly and quietly. * When this happens, 'page' will likely just be discarded * instead of migrated. */ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_THISNODE | __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = node }; return alloc_migration_target(page, (unsigned long)&mtc); } /* * Take pages on @demote_list and attempt to demote them to * another node. Pages which are not demoted are left on * @demote_pages. */ static unsigned int demote_page_list(struct list_head *demote_pages, struct pglist_data *pgdat) { int target_nid = next_demotion_node(pgdat->node_id); unsigned int nr_succeeded; int err; if (list_empty(demote_pages)) return 0; if (target_nid == NUMA_NO_NODE) return 0; /* Demotion ignores all cpuset and mempolicy settings */ err = migrate_pages(demote_pages, alloc_demote_page, NULL, target_nid, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); if (current_is_kswapd()) __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); else __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); return nr_succeeded; } /* * shrink_page_list() returns the number of reclaimed pages */ static unsigned int shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); LIST_HEAD(demote_pages); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc); retry: while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; enum page_references references = PAGEREF_RECLAIM; bool dirty, writeback, may_enter_fs; unsigned int nr_pages; cond_resched(); page = lru_to_page(page_list); list_del(&page->lru); if (!trylock_page(page)) goto keep; VM_BUG_ON_PAGE(PageActive(page), page); nr_pages = compound_nr(page); /* Account the number of base pages even though THP */ sc->nr_scanned += nr_pages; if (unlikely(!page_evictable(page))) goto activate_locked; if (!sc->may_unmap && page_mapped(page)) goto keep_locked; may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* * The number of dirty pages determines if a node is marked * reclaim_congested which affects wait_iff_congested. kswapd * will stall and start writing pages if the tail of the LRU * is all dirty unqueued pages. */ page_check_dirty_writeback(page, &dirty, &writeback); if (dirty || writeback) stat->nr_dirty++; if (dirty && !writeback) stat->nr_unqueued_dirty++; /* * Treat this page as congested if the underlying BDI is or if * pages are cycling through the LRU so quickly that the * pages marked for immediate reclaim are making it to the * end of the LRU a second time. */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) stat->nr_congested++; /* * If a page at the tail of the LRU is under writeback, there * are three cases to consider. * * 1) If reclaim is encountering an excessive number of pages * under writeback and this page is both under writeback and * PageReclaim then it indicates that pages are being queued * for IO but are being recycled through the LRU before the * IO can complete. Waiting on the page itself risks an * indefinite stall if it is impossible to writeback the * page due to IO error or disconnected storage so instead * note that the LRU is being scanned too quickly and the * caller can stall after page list has been processed. * * 2) Global or new memcg reclaim encounters a page that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, * not to fs). In this case mark the page for immediate * reclaim and continue scanning. * * Require may_enter_fs because we would wait on fs, which * may not have submitted IO yet. And the loop driver might * enter reclaim, and deadlock if it waits on a page for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * * 3) Legacy memcg encounters a page that is already marked * PageReclaim. memcg does not have any dirty pages * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * * In cases 1) and 2) we activate the pages to get them out of * the way while we continue scanning for clean pages on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. * Since they're marked for immediate reclaim, they won't put * memory pressure on the cache working set any longer than it * takes to write them to disk. */ if (PageWriteback(page)) { /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { stat->nr_immediate++; goto activate_locked; /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() * might have just cleared PageReclaim, then * setting PageReclaim here end up interpreted * as PageReadahead - but that does not matter * enough to care. What we do want is for this * page to have PageReclaim set next time memcg * reclaim reaches the tests above, so it will * then wait_on_page_writeback() to avoid OOM; * and it's also appropriate in global reclaim. */ SetPageReclaim(page); stat->nr_writeback++; goto activate_locked; /* Case 3 above */ } else { unlock_page(page); wait_on_page_writeback(page); /* then go back and try same page again */ list_add_tail(&page->lru, page_list); continue; } } if (!ignore_references) references = page_check_references(page, sc); switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: stat->nr_ref_keep += nr_pages; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: ; /* try to reclaim the page below */ } /* * Before reclaiming the page, try to relocate * its contents to another node. */ if (do_demote_pass && (thp_migration_supported() || !PageTransHuge(page))) { list_add(&page->lru, &demote_pages); unlock_page(page); continue; } /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree page could be freed directly */ if (PageAnon(page) && PageSwapBacked(page)) { if (!PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (page_maybe_dma_pinned(page)) goto keep_locked; if (PageTransHuge(page)) { /* cannot split THP, skip it */ if (!can_split_huge_page(page, NULL)) goto activate_locked; /* * Split pages without a PMD map right * away. Chances are some or all of the * tail pages can be freed without IO. */ if (!compound_mapcount(page) && split_huge_page_to_list(page, page_list)) goto activate_locked; } if (!add_to_swap(page)) { if (!PageTransHuge(page)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_huge_page_to_list(page, page_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); #endif if (!add_to_swap(page)) goto activate_locked_split; } may_enter_fs = true; /* Adding to swap updated mapping */ mapping = page_mapping(page); } } else if (unlikely(PageTransHuge(page))) { /* Split file THP */ if (split_huge_page_to_list(page, page_list)) goto keep_locked; } /* * THP may get split above, need minus tail pages and update * nr_pages to avoid accounting tail pages twice. * * The tail pages that are added into swap cache successfully * reach here. */ if ((nr_pages > 1) && !PageTransHuge(page)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ if (page_mapped(page)) { enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; try_to_unmap(page, flags); if (page_mapped(page)) { stat->nr_unmap_fail += nr_pages; if (!was_swapbacked && PageSwapBacked(page)) stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } if (PageDirty(page)) { /* * Only kswapd can writeback filesystem pages * to avoid risk of stack overflow. But avoid * injecting inefficient single-page IO into * flusher writeback as much as possible: only * write pages when we've encountered many * dirty pages, and when we've already scanned * the rest of the LRU for clean pages and see * the same dirty pages again (PageReclaim). */ if (page_is_file_lru(page) && (!current_is_kswapd() || !PageReclaim(page) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() * except we already have the page isolated * and know it's dirty */ inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); goto activate_locked; } if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) goto keep_locked; if (!sc->may_writepage) goto keep_locked; /* * Page is dirty. Flush the TLB if a writable entry * potentially exists to avoid CPU writes after IO * starts and then write it out here. */ try_to_unmap_flush_dirty(); switch (pageout(page, mapping)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: stat->nr_pageout += thp_nr_pages(page); if (PageWriteback(page)) goto keep; if (PageDirty(page)) goto keep; /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. */ if (!trylock_page(page)) goto keep; if (PageDirty(page) || PageWriteback(page)) goto keep_locked; mapping = page_mapping(page); fallthrough; case PAGE_CLEAN: ; /* try to free the page below */ } } /* * If the page has buffers, try to free the buffer mappings * associated with this page. If we succeed we try to free * the page as well. * * We do this even if the page is PageDirty(). * try_to_release_page() does not perform I/O, but it is * possible for a page to have PageDirty set, but it is actually * clean (all its buffers are clean). This happens if the * buffers were written out directly, with submit_bh(). ext3 * will do this, as well as the blockdev mapping. * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * * Rarely, pages can have buffers and no ->mapping. These are * the pages which were not successfully invalidated in * truncate_cleanup_page(). We try to drop those buffers here * and if that worked, and the page is no longer mapped into * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. */ if (page_has_private(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; if (!mapping && page_count(page) == 1) { unlock_page(page); if (put_page_testzero(page)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free * this page shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ nr_reclaimed++; continue; } } } if (PageAnon(page) && !PageSwapBacked(page)) { /* follow __remove_mapping for reference */ if (!page_ref_freeze(page, 1)) goto keep_locked; /* * The page has only one reference left, which is * from the isolation. After the caller puts the * page back on lru and drops the reference, the * page will be freed anyway. It doesn't matter * which lru it goes. So we don't bother checking * PageDirty here. */ count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true, sc->target_mem_cgroup)) goto keep_locked; unlock_page(page); free_it: /* * THP may get swapped out in a whole, need account * all base pages. */ nr_reclaimed += nr_pages; /* * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ if (unlikely(PageTransHuge(page))) destroy_compound_page(page); else list_add(&page->lru, &free_pages); continue; activate_locked_split: /* * The tail pages that are failed to add into swap cache * reach here. Fixup nr_scanned and nr_pages. */ if (nr_pages > 1) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || PageMlocked(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { int type = page_is_file_lru(page); SetPageActive(page); stat->nr_activate[type] += nr_pages; count_memcg_page_event(page, PGACTIVATE); } keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } /* 'page_list' is always empty here */ /* Migrate pages selected for demotion */ nr_reclaimed += demote_page_list(&demote_pages, pgdat); /* Pages that could not be demoted are still in @demote_pages */ if (!list_empty(&demote_pages)) { /* Pages which failed to demoted go back on @page_list for retry: */ list_splice_init(&demote_pages, page_list); do_demote_pass = false; goto retry; } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; mem_cgroup_uncharge_list(&free_pages); try_to_unmap_flush(); free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, }; struct reclaim_stat stat; unsigned int nr_reclaimed; struct page *page, *next; LIST_HEAD(clean_pages); unsigned int noreclaim_flag; list_for_each_entry_safe(page, next, page_list, lru) { if (!PageHuge(page) && page_is_file_lru(page) && !PageDirty(page) && !__PageMovable(page) && !PageUnevictable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } } /* * We should be safe here since we are only dealing with file pages and * we are not kswapd and therefore cannot write dirty file pages. But * call memalloc_noreclaim_save() anyway, just in case these conditions * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, &stat, true); memalloc_noreclaim_restore(noreclaim_flag); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); /* * Since lazyfree pages are isolated from file LRU from the beginning, * they will rotate back to anonymous LRU in the end if it failed to * discard so isolated count will be mismatched. * Compensate the isolated count for both LRU lists. */ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, stat.nr_lazyfree_fail); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)stat.nr_lazyfree_fail); return nr_reclaimed; } /* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken) { int zid; for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_zone_taken[zid]) continue; update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } } /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * * Lru_lock must be held before calling this function. * * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; unsigned long skipped = 0; unsigned long scan, total_scan, nr_pages; LIST_HEAD(pages_skipped); total_scan = 0; scan = 0; while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; struct page *page; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); nr_pages = compound_nr(page); total_scan += nr_pages; if (page_zonenum(page) > sc->reclaim_idx) { nr_skipped[page_zonenum(page)] += nr_pages; move_to = &pages_skipped; goto move; } /* * Do not count skipped pages because that makes the function * return with no isolated pages if the LRU mostly contains * ineligible pages. This causes the VM to not reclaim any * pages, triggering a premature OOM. * Account all tail pages of THP. */ scan += nr_pages; if (!PageLRU(page)) goto move; if (!sc->may_unmap && page_mapped(page)) goto move; /* * Be careful not to clear PageLRU until after we're * sure the page is not being freed elsewhere -- the * page release code relies on it. */ if (unlikely(!get_page_unless_zero(page))) goto move; if (!TestClearPageLRU(page)) { /* Another thread is already isolating this page */ put_page(page); goto move; } nr_taken += nr_pages; nr_zone_taken[page_zonenum(page)] += nr_pages; move_to = dst; move: list_move(&page->lru, move_to); } /* * Splice any skipped pages to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX * scanning would soon rescan the same pages to skip and put the * system at risk of premature OOM. */ if (!list_empty(&pages_skipped)) { int zid; list_splice(&pages_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } } *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, total_scan, skipped, nr_taken, sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } /** * isolate_lru_page - tries to isolate a page from its LRU list * @page: page to isolate from its LRU list * * Isolates a @page from an LRU list, clears PageLRU and adjusts the * vmstat statistic corresponding to whatever LRU list the page was on. * * Returns 0 if the page was removed from an LRU list. * Returns -EBUSY if the page was not on an LRU list. * * The returned page will have PageLRU() cleared. If it was found on * the active list, it will have PageActive set. If it was found on * the unevictable list, it will have the PageUnevictable bit set. That flag * may need to be cleared by the caller before letting the page go. * * The vmstat statistic corresponding to the list on which the page was * found will be decremented. * * Restrictions: * * (1) Must be called with an elevated refcount on the page. This is a * fundamental difference from isolate_lru_pages (which is called * without a stable reference). * (2) the lru_lock must not be held. * (3) interrupts must be enabled. */ int isolate_lru_page(struct page *page) { int ret = -EBUSY; VM_BUG_ON_PAGE(!page_count(page), page); WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (TestClearPageLRU(page)) { struct lruvec *lruvec; get_page(page); lruvec = lock_page_lruvec_irq(page); del_page_from_lru_list(page, lruvec); unlock_page_lruvec_irq(lruvec); ret = 0; } return ret; } /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get rescheduled. When there are massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ static int too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; if (current_is_kswapd()) return 0; if (!writeback_throttling_sane(sc)) return 0; if (file) { inactive = node_page_state(pgdat, NR_INACTIVE_FILE); isolated = node_page_state(pgdat, NR_ISOLATED_FILE); } else { inactive = node_page_state(pgdat, NR_INACTIVE_ANON); isolated = node_page_state(pgdat, NR_ISOLATED_ANON); } /* * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; return isolated > inactive; } /* * move_pages_to_lru() moves pages from private @list to appropriate LRU list. * On return, @list is reused as a list of pages to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ static unsigned int move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); struct page *page; while (!list_empty(list)) { page = lru_to_page(list); VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { spin_unlock_irq(&lruvec->lru_lock); putback_lru_page(page); spin_lock_irq(&lruvec->lru_lock); continue; } /* * The SetPageLRU needs to be kept here for list integrity. * Otherwise: * #0 move_pages_to_lru #1 release_pages * if !put_page_testzero * if (put_page_testzero()) * !PageLRU //skip lru_lock * SetPageLRU() * list_add(&page->lru,) * list_add(&page->lru,) */ SetPageLRU(page); if (unlikely(put_page_testzero(page))) { __clear_page_lru_flags(page); if (unlikely(PageCompound(page))) { spin_unlock_irq(&lruvec->lru_lock); destroy_compound_page(page); spin_lock_irq(&lruvec->lru_lock); } else list_add(&page->lru, &pages_to_free); continue; } /* * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration). */ VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page); add_page_to_lru_list(page, lruvec); nr_pages = thp_nr_pages(page); nr_moved += nr_pages; if (PageActive(page)) workingset_age_nonresident(lruvec, nr_pages); } /* * To save our caller's stack, now use input list for pages to free. */ list_splice(&pages_to_free, list); return nr_moved; } /* * If a kernel thread (such as nfsd for loop-back mounts) services * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE. * In that case we should only throttle if the backing device it is * writing to is congested. In other cases it is safe to throttle. */ static int current_may_throttle(void) { return !(current->flags & PF_LOCAL_THROTTLE) || current->backing_dev_info == NULL || bdi_write_congested(current->backing_dev_info); } /* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { LIST_HEAD(page_list); unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat; bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { if (stalled) return 0; /* wait a bit for the reclaimer. */ msleep(100); stalled = true; /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) return SWAP_CLUSTER_MAX; } lru_add_drain(); spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); if (nr_taken == 0) return 0; nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); spin_lock_irq(&lruvec->lru_lock); move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout); mem_cgroup_uncharge_list(&page_list); free_unref_page_list(&page_list); /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty pages to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of * dirty pages grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. */ if (stat.nr_unqueued_dirty == nr_taken) wakeup_flusher_threads(WB_REASON_VMSCAN); sc->nr.dirty += stat.nr_dirty; sc->nr.congested += stat.nr_congested; sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr.writeback += stat.nr_writeback; sc->nr.immediate += stat.nr_immediate; sc->nr.taken += nr_taken; if (file) sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } /* * shrink_active_list() moves pages from the active LRU to the inactive LRU. * * We move them the other way if the page is referenced by one or more * processes. * * If the pages are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if * the pages are mapped, the processing is slow (page_referenced()), so * we should drop lru_lock around each page. It's impossible to balance * this, so instead we remove the pages from the LRU while processing them. * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. * * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. */ static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { unsigned long nr_taken; unsigned long nr_scanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); if (!cgroup_reclaim(sc)) __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); list_del(&page->lru); if (unlikely(!page_evictable(page))) { putback_lru_page(page); continue; } if (unlikely(buffer_heads_over_limit)) { if (page_has_private(page) && trylock_page(page)) { if (page_has_private(page)) try_to_release_page(page, 0); unlock_page(page); } } if (page_referenced(page, 0, sc->target_mem_cgroup, &vm_flags)) { /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So * that executable code get better chances to stay in * memory under moderate memory pressure. Anon pages * are not likely to be evicted by use-once streaming * IO, plus JVM can create lots of anon VM_EXEC pages, * so we ignore them here. */ if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { nr_rotated += thp_nr_pages(page); list_add(&page->lru, &l_active); continue; } } ClearPageActive(page); /* we are de-activating */ SetPageWorkingset(page); list_add(&page->lru, &l_inactive); } /* * Move pages back to the lru list. */ spin_lock_irq(&lruvec->lru_lock); nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); /* Keep all free pages in l_active list */ list_splice(&l_inactive, &l_active); __count_vm_events(PGDEACTIVATE, nr_deactivate); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } unsigned long reclaim_pages(struct list_head *page_list) { int nid = NUMA_NO_NODE; unsigned int nr_reclaimed = 0; LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; struct page *page; unsigned int noreclaim_flag; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, .no_demotion = 1, }; noreclaim_flag = memalloc_noreclaim_save(); while (!list_empty(page_list)) { page = lru_to_page(page_list); if (nid == NUMA_NO_NODE) { nid = page_to_nid(page); INIT_LIST_HEAD(&node_page_list); } if (nid == page_to_nid(page)) { ClearPageActive(page); list_move(&page->lru, &node_page_list); continue; } nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); putback_lru_page(page); } nid = NUMA_NO_NODE; } if (!list_empty(&node_page_list)) { nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); putback_lru_page(page); } } memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { if (sc->may_deactivate & (1 << is_file_lru(lru))) shrink_active_list(nr_to_scan, lruvec, sc, lru); else sc->skipped_deactivate = 1; return 0; } return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } /* * The inactive anon list should be small enough that the VM never has * to do too much work. * * The inactive file list should be small enough to leave most memory * to the established workingset on the scan-resistant active list, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive * page has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the pages are kept on the inactive list. * * total target max * memory ratio inactive * ------------------------------------- * 10MB 1 5MB * 100MB 1 50MB * 1GB 3 250MB * 10GB 10 0.9GB * 100GB 31 3GB * 1TB 101 10GB * 10TB 320 32GB */ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; unsigned long inactive_ratio; unsigned long gb; inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) inactive_ratio = int_sqrt(10 * gb); else inactive_ratio = 1; return inactive * inactive_ratio < active; } enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE, }; /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. * * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; unsigned long ap, fp; enum lru_list lru; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; } /* * Global reclaim will swap to prevent OOM even with no * swappiness, but memcg users want to use this knob to * disable swapping for individual groups completely when * using the memory controller's swap limit feature would be * too expensive. */ if (cgroup_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } /* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } /* * If the system is almost out of file pages, force-scan anon. */ if (sc->file_is_tiny) { scan_balance = SCAN_ANON; goto out; } /* * If there is enough inactive page cache, we do not reclaim * anything from the anonymous working right now. */ if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; goto out; } scan_balance = SCAN_FRACT; /* * Calculate the pressure balance between anon and file pages. * * The amount of pressure we put on each LRU is inversely * proportional to the cost of reclaiming each list, as * determined by the share of pages that are refaulting, times * the relative IO cost of bringing back a swapped out * anonymous page vs reloading a filesystem page (swappiness). * * Although we limit that influence to ensure no list gets * left behind completely: at least a third of the pressure is * applied, before swappiness. * * With swappiness at 100, anon and file have equal IO cost. */ total_cost = sc->anon_cost + sc->file_cost; anon_cost = total_cost + sc->anon_cost; file_cost = total_cost + sc->file_cost; total_cost = anon_cost + file_cost; ap = swappiness * (total_cost + 1); ap /= anon_cost + 1; fp = (200 - swappiness) * (total_cost + 1); fp /= file_cost + 1; fraction[0] = ap; fraction[1] = fp; denominator = ap + fp; out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); unsigned long lruvec_size; unsigned long low, min; unsigned long scan; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min * setting. * * This is important, as otherwise scanning aggression * becomes extremely binary -- from nothing as we * approach the memory protection threshold, to totally * nominal as we exceed it. This results in requiring * setting extremely liberal protection thresholds. It * also means we simply get no protection at all if we * set it too low, which is not ideal. * * If there is any protection in place, we reduce scan * pressure by how much of the total memory used is * within protection thresholds. * * There is one special case: in the first reclaim pass, * we skip over all groups that are within their low * protection. If that fails to reclaim enough pages to * satisfy the reclaim goal, we come back and override * the best-effort low protection. However, we still * ideally want to honor how well-behaved groups are in * that case instead of simply punishing them all * equally. As such, we reclaim them based on how much * memory they are using, reducing the scan pressure * again by how much of the total memory used is under * hard protection. */ unsigned long cgroup_size = mem_cgroup_size(memcg); unsigned long protection; /* memory.low scaling, make sure we retry before OOM */ if (!sc->memcg_low_reclaim && low > min) { protection = low; sc->memcg_low_skipped = 1; } else { protection = min; } /* Avoid TOCTOU with earlier protection check */ cgroup_size = max(cgroup_size, protection); scan = lruvec_size - lruvec_size * protection / (cgroup_size + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep * reclaim moving forwards, avoiding decrementing * sc->priority further than desirable. */ scan = max(scan, SWAP_CLUSTER_MAX); } else { scan = lruvec_size; } scan >>= sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: /* Scan lists relative to size */ break; case SCAN_FRACT: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. * Make sure we don't miss the last page on * the offlined memory cgroups because of a * round-off error. */ scan = mem_cgroup_online(memcg) ? div64_u64(scan * fraction[file], denominator) : DIV64_U64_ROUND_UP(scan * fraction[file], denominator); break; case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) scan = 0; break; default: /* Look ma, no brain */ BUG(); } nr[lru] = scan; } } /* * Anonymous LRU management is a waste if there is * ultimately no way to reclaim the memory. */ static bool can_age_anon_pages(struct pglist_data *pgdat, struct scan_control *sc) { /* Aging the anon LRU is valuable if swap is present: */ if (total_swap_pages > 0) return true; /* Also valuable if anon pages can be demoted: */ return can_demote(pgdat->node_id, sc); } static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; bool proportional_reclaim; struct blk_plug plug; get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); /* * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal * event that can occur when there is little memory pressure e.g. * multiple streaming readers/writers. Hence, we do not abort scanning * when the requested number of pages are reclaimed when scanning at * DEF_PRIORITY on the assumption that the fact we are direct * reclaiming implies that kswapd is not keeping up and it is best to * do a batch of work at once. For memcg reclaim one check is made to * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { unsigned long nr_anon, nr_file, percentage; unsigned long nr_scanned; for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, sc); } } cond_resched(); if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) continue; /* * For kswapd and memcg, reclaim at least the number of pages * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. */ nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; /* * It's just vindictive to attack the larger once the smaller * has gone to zero. And given the way we stop scanning the * smaller below, this makes sure that we only make one nudge * towards proportionality once we've got nr_to_reclaim. */ if (!nr_file || !nr_anon) break; if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; lru = LRU_BASE; percentage = nr_anon * 100 / scan_target; } else { unsigned long scan_target = targets[LRU_INACTIVE_FILE] + targets[LRU_ACTIVE_FILE] + 1; lru = LRU_FILE; percentage = nr_file * 100 / scan_target; } /* Stop scanning the smaller of the LRU */ nr[lru] = 0; nr[lru + LRU_ACTIVE] = 0; /* * Recalculate the other LRU scan count based on its original * scan target and the percentage scanning already complete */ lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); lru += LRU_ACTIVE; nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; return false; } /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; int z; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) return false; /* * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX * number of pages that were scanned. This will return to the caller * with the risk reclaim/compaction and the resulting allocation attempt * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL * allocations through requiring that the full LRU list has been scanned * first, by assuming that zero delta of sc->nr_scanned means full LRU * scan, but that approximation was wrong, and there were corner cases * where always a non-zero amount of pages were scanned. */ if (!nr_reclaimed) return false; /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { struct zone *zone = &pgdat->node_zones[z]; if (!managed_zone(zone)) continue; switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { case COMPACT_SUCCESS: case COMPACT_CONTINUE: return false; default: /* check next zone */ ; } } /* * If we have not reclaimed enough pages for compaction and the * inactive lists are large enough, continue reclaiming */ pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); return inactive_lru_pages > pages_for_compaction; } static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; struct mem_cgroup *memcg; memcg = mem_cgroup_iter(target_memcg, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long reclaimed; unsigned long scanned; /* * This loop can become CPU-bound when target memcgs * aren't eligible for reclaim - either because they * don't have any reclaimable pages, or because their * memory is explicitly protected. Avoid soft lockups. */ cond_resched(); mem_cgroup_calculate_protection(target_memcg, memcg); if (mem_cgroup_below_min(memcg)) { /* * Hard protection. * If there is no reclaimable memory, OOM. */ continue; } else if (mem_cgroup_below_low(memcg)) { /* * Soft protection. * Respect the protection only as long as * there is an unprotected supply * of reclaimable memory from other cgroups. */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); } static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; struct lruvec *target_lruvec; bool reclaimable = false; unsigned long file; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: /* * Flush the memory cgroup stats, so that we read accurate per-memcg * lruvec stats for heuristics. */ mem_cgroup_flush_stats(); memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; /* * Determine the scan balance between anon and file LRUs. */ spin_lock_irq(&target_lruvec->lru_lock); sc->anon_cost = target_lruvec->anon_cost; sc->file_cost = target_lruvec->file_cost; spin_unlock_irq(&target_lruvec->lru_lock); /* * Target desirable inactive:active list ratios for the anon * and file LRU lists. */ if (!sc->force_deactivate) { unsigned long refaults; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); if (refaults != target_lruvec->refaults[0] || inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) sc->may_deactivate |= DEACTIVATE_ANON; else sc->may_deactivate &= ~DEACTIVATE_ANON; /* * When refaults are being observed, it means a new * workingset is being established. Deactivate to get * rid of any stale active pages quickly. */ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); if (refaults != target_lruvec->refaults[1] || inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) sc->may_deactivate |= DEACTIVATE_FILE; else sc->may_deactivate &= ~DEACTIVATE_FILE; } else sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; /* * If we have plenty of inactive file pages that aren't * thrashing, try to reclaim those first before touching * anonymous pages. */ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) sc->cache_trim_mode = 1; else sc->cache_trim_mode = 0; /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip * the scan balance towards the file LRU. And as the file LRU * shrinks, so does the window for rotation from references. * This means we have a runaway feedback loop where a tiny * thrashing file LRU becomes infinitely more attractive than * anon pages. Try to detect this based on file LRU size. */ if (!cgroup_reclaim(sc)) { unsigned long total_high_wmark = 0; unsigned long free, anon; int z; free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); file = node_page_state(pgdat, NR_ACTIVE_FILE) + node_page_state(pgdat, NR_INACTIVE_FILE); for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = &pgdat->node_zones[z]; if (!managed_zone(zone)) continue; total_high_wmark += high_wmark_pages(zone); } /* * Consider anon: if that's low too, this isn't a * runaway file reclaim problem, but rather just * extreme pressure. Reclaim as per usual then. */ anon = node_page_state(pgdat, NR_INACTIVE_ANON); sc->file_is_tiny = file + free <= total_high_wmark && !(sc->may_deactivate & DEACTIVATE_ANON) && anon >> sc->priority; } shrink_node_memcgs(pgdat, sc); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } /* Record the subtree's reclaim efficiency */ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; if (current_is_kswapd()) { /* * If reclaim is isolating dirty pages under writeback, * it implies that the long-lived page allocation rate * is exceeding the page laundering rate. Either the * global limits are not being effective at throttling * processes due to the page distribution throughout * zones or there is heavy usage of a slow backing * device. The only option is to throttle from reclaim * context which is not ideal as there is no guarantee * the dirtying process is throttled in the same way * balance_dirty_pages() manages. * * Once a node is flagged PGDAT_WRITEBACK, kswapd will * count the number of pages under pages flagged for * immediate reclaim and stall if any are encountered * in the nr_immediate check below. */ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* Allow kswapd to start writing pages during reclaim.*/ if (sc->nr.unqueued_dirty == sc->nr.file_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it * implies that pages are cycling through the LRU * faster than they are written so also forcibly stall. */ if (sc->nr.immediate) congestion_wait(BLK_RW_ASYNC, HZ/10); } /* * Tag a node/memcg as congested if all the dirty pages * scanned were backed by a congested BDI and * wait_iff_congested will stall. * * Legacy memcg will stall in page writeback so avoid forcibly * stalling in wait_iff_congested(). */ if ((current_is_kswapd() || (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && sc->nr.dirty && sc->nr.dirty == sc->nr.congested) set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); /* * Stall direct reclaim for IO completions if underlying BDIs * and node is congested. Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) wait_iff_congested(BLK_RW_ASYNC, HZ/10); if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc)) goto again; /* * Kswapd gives up on balancing particular nodes after too * many failures to reclaim anything from them and goes to * sleep. On reclaim progress, reset the failure counter. A * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) pgdat->kswapd_failures = 0; } /* * Returns true if compaction should go ahead for a costly-order request, or * the allocation would already succeed without compaction. Return false if we * should reclaim first. */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; enum compact_result suitable; suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); if (suitable == COMPACT_SUCCESS) /* Allocation should succeed already. Don't reclaim. */ return true; if (suitable == COMPACT_SKIPPED) /* Compaction cannot yet proceed. Do reclaim. */ return false; /* * Compaction is already possible, but it takes time to run and there * are potentially other callers using the pages just freed. So proceed * with reclaim to make a buffer of free pages available to give * compaction a reasonable chance of completing and allocating the page. * Note that we won't actually reclaim the whole buffer in one attempt * as the target watermark in should_continue_reclaim() is lower. But if * we are already above the high+gap watermark, don't reclaim at all. */ watermark = high_wmark_pages(zone) + compact_gap(sc->order); return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); } /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; gfp_t orig_mask; pg_data_t *last_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; sc->reclaim_idx = gfp_zone(sc->gfp_mask); } for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { /* * Take care memory controller reclaiming has small influence * to global LRU. */ if (!cgroup_reclaim(sc)) { if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) continue; /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. * Even though compaction is invoked for any * non-zero order, only frequent costly order * reclamation is disruptive enough to become a * noticeable problem, like transparent huge * page allocations. */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && compaction_ready(zone, sc)) { sc->compaction_ready = true; continue; } /* * Shrink each node in the zonelist once. If the * zonelist is ordered by zone (not the default) then a * node may be shrunk multiple times but in that case * the user prefers lower zones being preserved. */ if (zone->zone_pgdat == last_pgdat) continue; /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and * scanned pages. This works for global memory pressure * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, sc->order, sc->gfp_mask, &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } /* See comment about same check for global reclaim above */ if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; shrink_node(zone->zone_pgdat, sc); } /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; } static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { struct lruvec *target_lruvec; unsigned long refaults; target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[0] = refaults; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); target_lruvec->refaults[1] = refaults; } /* * This is the main entry point to direct page reclaim. * * If a full scan of the inactive list fails to free enough memory then we * are "out of memory" and something needs to be killed. * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this * caller can't do much about. We kick the writeback threads and take explicit * naps in the hope that some of these pages can be written. But if the * allocating task holds filesystem locks which prevent writeout this might not * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; pg_data_t *last_pgdat; struct zoneref *z; struct zone *zone; retry: delayacct_freepages_start(); if (!cgroup_reclaim(sc)) __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; shrink_zones(zonelist, sc); if (sc->nr_reclaimed >= sc->nr_to_reclaim) break; if (sc->compaction_ready) break; /* * If we're getting trouble reclaiming, start doing * writepage even in laptop mode. */ if (sc->priority < DEF_PRIORITY - 2) sc->may_writepage = 1; } while (--sc->priority >= 0); last_pgdat = NULL; for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); if (cgroup_reclaim(sc)) { struct lruvec *lruvec; lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); clear_bit(LRUVEC_CONGESTED, &lruvec->flags); } } delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; /* Aborted reclaim to try compaction? don't OOM, then */ if (sc->compaction_ready) return 1; /* * We make inactive:active ratio decisions based on the node's * composition of memory, but a restrictive reclaim_idx or a * memory.low cgroup setting can exempt large amounts of * memory from reclaim. Neither of which are very common, so * instead of doing costly eligibility calculations of the * entire cgroup subtree up front, we assume the estimates are * good, and retry with forcible deactivation if that fails. */ if (sc->skipped_deactivate) { sc->priority = initial_priority; sc->force_deactivate = 1; sc->skipped_deactivate = 0; goto retry; } /* Untapped cgroup reserves? Don't OOM, retry. */ if (sc->memcg_low_skipped) { sc->priority = initial_priority; sc->force_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; } return 0; } static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; unsigned long free_pages = 0; int i; bool wmark_ok; if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!managed_zone(zone)) continue; if (!zone_reclaimable_pages(zone)) continue; pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state(zone, NR_FREE_PAGES); } /* If there are no reserves (unexpected config) then do not throttle */ if (!pfmemalloc_reserve) return true; wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } return wmark_ok; } /* * Throttle direct reclaimers if backing storage is backed by the network * and the PFMEMALLOC reserve for the preferred node is getting dangerously * depleted. kswapd will continue to make progress and wake the processes * when the low watermark is reached. * * Returns true if a fatal signal was delivered during throttling. If this * happens, the page allocator should not consider triggering the OOM killer. */ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { struct zoneref *z; struct zone *zone; pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly * responsible for cleaning pages necessary for reclaim to make forward * progress. kjournald for example may enter direct reclaim while * committing a transaction where throttling it could forcing other * processes to block on log_wait_commit(). */ if (current->flags & PF_KTHREAD) goto out; /* * If a fatal signal is pending, this process should not throttle. * It should return quickly so it can exit and free its memory */ if (fatal_signal_pending(current)) goto out; /* * Check if the pfmemalloc reserves are ok by finding the first node * with a usable ZONE_NORMAL or lower zone. The expectation is that * GFP_KERNEL will be required for allocating network buffers when * swapping over the network so ZONE_HIGHMEM is unusable. * * Throttling is based on the first usable node and throttled processes * wait on a queue until kswapd makes progress and wakes them. There * is an affinity then between processes waking up and where reclaim * progress has been made assuming the process wakes on the same node. * More importantly, processes running on remote nodes will not compete * for remote pfmemalloc reserves and processes on different nodes * should make reasonable progress. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { if (zone_idx(zone) > ZONE_NORMAL) continue; /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; if (allow_direct_reclaim(pgdat)) goto out; break; } /* If no zone was usable by the allocation flags then do not throttle */ if (!pgdat) goto out; /* Account for the throttling */ count_vm_event(PGSCAN_DIRECT_THROTTLE); /* * If the caller cannot enter the filesystem, it's possible that it * is due to the caller holding an FS lock or performing a journal * transaction in the case of a filesystem like ext[3|4]. In this case, * it is not safe to block on pfmemalloc_wait as kswapd could be * blocked waiting on the same lock. Instead, throttle for up to a * second before continuing. */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat)); if (fatal_signal_pending(current)) return true; out: return false; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, }; /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. */ BUILD_BUG_ON(MAX_ORDER > S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); /* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this * point. */ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); set_task_reclaim_state(current, NULL); return nr_reclaimed; } #ifdef CONFIG_MEMCG /* Only used by soft limit reclaim. Do not reuse for anything else. */ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned) { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; WARN_ON_ONCE(!current->reclaim_state); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, sc.gfp_mask); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. * if we don't reclaim here, the shrink_node from balance_pgdat * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ shrink_lruvec(lruvec, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, bool may_swap) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = may_swap, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put * equal pressure on all the nodes. This is based on the assumption that * the reclaim does not bail out early. */ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); set_task_reclaim_state(current, NULL); return nr_reclaimed; } #endif static void age_active_anon(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; struct lruvec *lruvec; if (!can_age_anon_pages(pgdat, sc)) return; lruvec = mem_cgroup_lruvec(NULL, pgdat); if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) return; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { lruvec = mem_cgroup_lruvec(memcg, pgdat); shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); memcg = mem_cgroup_iter(NULL, memcg, NULL); } while (memcg); } static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; /* * Check for watermark boosts top-down as the higher zones * are more likely to be boosted. Both watermarks and boosts * should not be checked at the same time as reclaim would * start prematurely when there is no boosting and a lower * zone is balanced. */ for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; if (zone->watermark_boost) return true; } return false; } /* * Returns true if there is an eligible zone balanced for the request order * and highest_zoneidx */ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; struct zone *zone; /* * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; mark = high_wmark_pages(zone); if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } /* * If a node has no populated zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ if (mark == -1) return true; return false; } /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); clear_bit(LRUVEC_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } /* * Prepare kswapd for sleeping. This verifies that there are no processes * waiting in throttle_direct_reclaim() and that watermarks have been met. * * Returns true if kswapd is ready to sleep */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the * zones, which causes kswapd to exit balance_pgdat() before reaching * the wake up checks. If kswapd is going to sleep, no process should * be sleeping on pfmemalloc_wait, so wake them now if necessary. If * the wake up is premature, processes will wake kswapd and get * throttled again. The difference from wake ups in balance_pgdat() is * that here we are under prepare_to_wait(). */ if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } return false; } /* * kswapd shrinks a node of pages that are at or below the highest usable * zone that is currently unbalanced. * * Returns true if kswapd scanned at least the requested number of pages to * reclaim or if the lack of progress was due to pages under writeback. * This is used to determine if the scanning priority needs to be raised. */ static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct zone *zone; int z; /* Reclaim a number of pages proportional to the number of zones */ sc->nr_to_reclaim = 0; for (z = 0; z <= sc->reclaim_idx; z++) { zone = pgdat->node_zones + z; if (!managed_zone(zone)) continue; sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); } /* * Historically care was taken to put equal pressure on all zones but * now pressure is applied based on node LRU order. */ shrink_node(pgdat, sc); /* * Fragmentation may mean that the system cannot be rebalanced for * high-order allocations. If twice the allocation size has been * reclaimed then recheck watermarks only at order-0 to prevent * excessive reclaim. Assume that a process requested a high-order * can direct reclaim/compact. */ if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) sc->order = 0; return sc->nr_scanned >= sc->nr_to_reclaim; } /* Page allocator PCP high watermark is lowered if reclaim is active. */ static inline void update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) { int i; struct zone *zone; for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; if (active) set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); else clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); } } static inline void set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { update_reclaim_active(pgdat, highest_zoneidx, true); } static inline void clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { update_reclaim_active(pgdat, highest_zoneidx, false); } /* * For kswapd, balance_pgdat() will reclaim pages across a node from zones * that are eligible for use by the caller until at least one zone is * balanced. * * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long pflags; unsigned long nr_boost_reclaim; unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; bool boosted; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, .may_unmap = 1, }; set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); __fs_reclaim_acquire(_THIS_IP_); count_vm_event(PAGEOUTRUN); /* * Account for the reclaim boost. Note that the zone boost is left in * place so that parallel allocations that are near the watermark will * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; nr_boost_reclaim += zone->watermark_boost; zone_boosts[i] = zone->watermark_boost; } boosted = nr_boost_reclaim; restart: set_reclaim_active(pgdat, highest_zoneidx); sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; bool balanced; bool ret; sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed * then consider reclaiming from all zones. This has a dual * purpose -- on 64-bit systems it is expected that * buffer_heads are stripped during active rotation. On 32-bit * systems, highmem pages can pin lowmem memory and shrinking * buffers can relieve lowmem pressure. Reclaim may still not * go ahead if all eligible zones for the original allocation * request are balanced to avoid excessive reclaim from kswapd. */ if (buffer_heads_over_limit) { for (i = MAX_NR_ZONES - 1; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; sc.reclaim_idx = i; break; } } /* * If the pgdat is imbalanced then ignore boosting and preserve * the watermarks for a later time and restart. Note that the * zone watermarks will be still reset at the end of balancing * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. */ balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; } /* * If boosting is not active then only reclaim if there are no * eligible zones. Note that sc.reclaim_idx is not used as * buffer_heads_over_limit may have adjusted it. */ if (!nr_boost_reclaim && balanced) goto out; /* Limit the priority of boosting to avoid reclaim writeback */ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) raise_priority = false; /* * Do not writeback or swap pages for boosted reclaim. The * intent is to relieve pressure not issue sub-optimal IO * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted. */ sc.may_writepage = !laptop_mode && !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; /* * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. All * pages are rotated regardless of classzone as this is * about consistent aging. */ age_active_anon(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ sc.nr_scanned = 0; nr_soft_scanned = 0; nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, sc.gfp_mask, &nr_soft_scanned); sc.nr_reclaimed += nr_soft_reclaimed; /* * There should be no need to raise the scanning priority if * enough pages are already being scanned that that high * watermark would be met at 100% efficiency. */ if (kswapd_shrink_node(pgdat, &sc)) raise_priority = false; /* * If the low watermark is met there is no need for processes * to be throttled on pfmemalloc_wait as they should not be * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = try_to_freeze(); __fs_reclaim_acquire(_THIS_IP_); if (ret || kthread_should_stop()) break; /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); /* * If reclaim made no progress for a boost, stop reclaim as * IO cannot be queued and it could be an infinite loop in * extreme circumstances. */ if (nr_boost_reclaim && !nr_reclaimed) break; if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); if (!sc.nr_reclaimed) pgdat->kswapd_failures++; out: clear_reclaim_active(pgdat, highest_zoneidx); /* If reclaim was boosted, account for the reclaim done in this pass */ if (boosted) { unsigned long flags; for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; /* Increments are under the zone lock */ zone = pgdat->node_zones + i; spin_lock_irqsave(&zone->lock, flags); zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); spin_unlock_irqrestore(&zone->lock, flags); } /* * As there is now likely space, wakeup kcompact to defragment * pageblocks. */ wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); __fs_reclaim_release(_THIS_IP_); psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller * entered the allocator slow path while kswapd was awake, order will * remain at the higher level. */ return sc.order; } /* * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is * not a valid index then either kswapd runs for first time or kswapd couldn't * sleep after previous reclaim attempt (node is still unbalanced). In that * case return the zone index of the previous kswapd reclaim cycle. */ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, enum zone_type prev_highest_zoneidx) { enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); if (freezing(current) || kthread_should_stop()) return; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* * Try to sleep for a short interval. Note that kcompactd will only be * woken if it is possible to sleep for a short interval. This is * deliberate on the assumption that if reclaim cannot keep an * eligible zone balanced that it's also unlikely that compaction will * succeed. */ if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. * When kswapd is going to sleep, it is reasonable to assume * that pages and compaction may succeed so reset the cache. */ reset_isolation_suitable(pgdat); /* * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { WRITE_ONCE(pgdat->kswapd_highest_zoneidx, kswapd_highest_zoneidx(pgdat, highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); } finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); } /* * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ if (!remaining && prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* * vmstat counters are not perfectly accurate and the estimated * value for counters such as NR_FREE_PAGES can deviate from the * true value by nr_online_cpus * threshold. To avoid the zone * watermarks being breached while under pressure, we reduce the * per-cpu vmstat threshold while kswapd is awake and restore * them before going back to sleep. */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); if (!kthread_should_stop()) schedule(); set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); else count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); } finish_wait(&pgdat->kswapd_wait, &wait); } /* * The background pageout daemon, started as a kernel thread * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity * that frees anything up. This is needed for things like routing * etc, where we otherwise might have all activity going on in * asynchronous contexts that cannot page things out. * * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); /* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it * regardless (see "__alloc_pages()"). "kswapd" should * never get caught in the normal page freeing logic. * * (Kswapd normally doesn't need memory anyway, but sometimes * you need a small amount of memory in order to be able to * page out something else, and this flag essentially protects * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). */ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); for ( ; ; ) { bool ret; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, highest_zoneidx); /* Read the new order and highest_zoneidx */ alloc_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); ret = try_to_freeze(); if (kthread_should_stop()) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ if (ret) continue; /* * Reclaim begins at the requested order but if a high-order * reclaim fails then kswapd falls back to reclaiming for * order-0. If that happens, kswapd will consider sleeping * for the order it finished reclaiming at (reclaim_order) * but kcompactd is woken to compact for the original * request (alloc_order). */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); reclaim_order = balance_pgdat(pgdat, alloc_order, highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); return 0; } /* * A zone is low on free memory or too fragmented for high-order memory. If * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim * has failed or is not needed, still wake up kcompactd if only compaction is * needed. */ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; if (!managed_zone(zone)) return; if (!cpuset_zone_allowed(zone, gfp_flags)) return; pgdat = zone->zone_pgdat; curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd * and rely on compaction_suitable() to determine if it's * needed. If it fails, it will defer subsequent attempts to * ratelimit its work. */ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, .reclaim_idx = MAX_NR_ZONES - 1, .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, .hibernation_mode = 1, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); unsigned long nr_reclaimed; unsigned int noreclaim_flag; fs_reclaim_acquire(sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(current, &sc.reclaim_state); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); set_task_reclaim_state(current, NULL); memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ /* * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); if (pgdat->kswapd) return; pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); pgdat->kswapd = NULL; } } /* * Called by memory hotplug when all memory in a node is offlined. Caller must * hold mem_hotplug_begin/end(). */ void kswapd_stop(int nid) { struct task_struct *kswapd = NODE_DATA(nid)->kswapd; if (kswapd) { kthread_stop(kswapd); NODE_DATA(nid)->kswapd = NULL; } } static int __init kswapd_init(void) { int nid; swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); return 0; } module_init(kswapd_init) #ifdef CONFIG_NUMA /* * Node reclaim mode * * If non-zero call node_reclaim when the number of free pages falls below * the watermarks. */ int node_reclaim_mode __read_mostly; /* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. */ #define NODE_RECLAIM_PRIORITY 4 /* * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = 1; /* * If the number of slab pages in a zone grows beyond this percentage then * slab reclaim needs to occur. */ int sysctl_min_slab_ratio = 5; static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) { unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + node_page_state(pgdat, NR_ACTIVE_FILE); /* * It's possible for there to be more file mapped pages than * accounted for by the pages on the file LRU lists because * tmpfs pages accounted for as ANON can also be FILE_MAPPED */ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { unsigned long nr_pagecache_reclaimable; unsigned long delta = 0; /* * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and node_unmapped_file_pages() provides * a better estimate */ if (node_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); else nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); /* If we can't clean pages, remove dirty pages from consideration */ if (!(node_reclaim_mode & RECLAIM_WRITE)) delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) delta = nr_pagecache_reclaimable; return nr_pagecache_reclaimable - delta; } /* * Try to free up some pages from this node through reclaim. */ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, .reclaim_idx = gfp_zone(gfp_mask), }; unsigned long pflags; trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, sc.gfp_mask); cond_resched(); psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE * and RECLAIM_UNMAP. */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; set_task_reclaim_state(p, &sc.reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. */ do { shrink_node(pgdat, &sc); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } set_task_reclaim_state(p, NULL); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); psi_memstall_leave(&pflags); trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); return sc.nr_reclaimed >= nr_pages; } int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { int ret; /* * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately * thrown out if the node is overallocated. So we do not reclaim * if less than a specified percentage of the node is used by * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return NODE_RECLAIM_NOSCAN; /* * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible. */ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN; if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; ret = __node_reclaim(pgdat, gfp_mask, order); clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); return ret; } #endif /** * check_move_unevictable_pages - check pages for evictability and move to * appropriate zone lru list * @pvec: pagevec with lru pages to check * * Checks pages for evictability, if an evictable page is in the unevictable * lru list, moves it to the appropriate evictable lru list. This function * should be only used for lru pages. */ void check_move_unevictable_pages(struct pagevec *pvec) { struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; int nr_pages; if (PageTransTail(page)) continue; nr_pages = thp_nr_pages(page); pgscanned += nr_pages; /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; lruvec = relock_page_lruvec_irq(page, lruvec); if (page_evictable(page) && PageUnevictable(page)) { del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); add_page_to_lru_list(page, lruvec); pgrescued += nr_pages; } SetPageLRU(page); } if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); unlock_page_lruvec_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
14 14 14 14 14 14 14 14 14 14 13 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 17 17 17 14 16 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 13 14 14 14 14 14 14 14 14 14 13 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018-2021 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/timer.h> #include <linux/rtnetlink.h> #include <net/codel.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "sta_info.h" #include "debugfs_sta.h" #include "mesh.h" #include "wme.h" /** * DOC: STA information lifetime rules * * STA info structures (&struct sta_info) are managed in a hash table * for faster lookup and a list for iteration. They are managed using * RCU, i.e. access to the list and hash table is protected by RCU. * * Upon allocating a STA info structure with sta_info_alloc(), the caller * owns that structure. It must then insert it into the hash table using * either sta_info_insert() or sta_info_insert_rcu(); only in the latter * case (which acquires an rcu read section but must not be called from * within one) will the pointer still be valid after the call. Note that * the caller may not do much with the STA info before inserting it, in * particular, it may not start any mesh peer link management or add * encryption keys. * * When the insertion fails (sta_info_insert()) returns non-zero), the * structure will have been freed by sta_info_insert()! * * Station entries are added by mac80211 when you establish a link with a * peer. This means different things for the different type of interfaces * we support. For a regular station this mean we add the AP sta when we * receive an association response from the AP. For IBSS this occurs when * get to know about a peer on the same IBSS. For WDS we add the sta for * the peer immediately upon device open. When using AP mode we add stations * for each respective station upon request from userspace through nl80211. * * In order to remove a STA info structure, various sta_info_destroy_*() * calls are available. * * There is no concept of ownership on a STA entry, each structure is * owned by the global hash table/list until it is removed. All users of * the structure need to be RCU protected so that the structure won't be * freed before they are done using it. */ static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; /* Caller must hold local->sta_mtx */ static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_remove(&local->sta_hash, &sta->hash_node, sta_rht_params); } static void __cleanup_single_sta(struct sta_info *sta) { int ac, i; struct tid_ampdu_tx *tid_tx; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ps_data *ps; if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_PS_STA); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); clear_sta_flag(sta, WLAN_STA_PS_DELIVER); atomic_dec(&ps->num_sta_ps); } if (sta->sta.txq[0]) { for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi; if (!sta->sta.txq[i]) continue; txqi = to_txq_info(sta->sta.txq[i]); ieee80211_txq_purge(local, txqi); } } for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]); } if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_sta_cleanup(sta); cancel_work_sync(&sta->drv_deliver_wk); /* * Destroy aggregation state here. It would be nice to wait for the * driver to finish aggregation stop and then clean up, but for now * drivers have to handle aggregation stop being requested, followed * directly by station destruction. */ for (i = 0; i < IEEE80211_NUM_TIDS; i++) { kfree(sta->ampdu_mlme.tid_start_tx[i]); tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]); if (!tid_tx) continue; ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending); kfree(tid_tx); } } static void cleanup_single_sta(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; __cleanup_single_sta(sta); sta_info_free(local, sta); } struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->sta_hash, addr, sta_rht_params); } /* protected by RCU */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } /* * Get sta info either from the specified interface * or from one of its vlans */ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, const u8 *sta_addr, const u8 *vif_addr) { struct rhlist_head *tmp; struct sta_info *sta; for_each_sta_info(local, sta_addr, sta, tmp) { if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) return sta; } return NULL; } struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; int i = 0; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->sta_mtx)) { if (sdata != sta->sdata) continue; if (i < idx) { ++i; continue; } return sta; } return NULL; } /** * sta_info_free - free STA * * @local: pointer to the global information * @sta: STA info to free * * This function must undo everything done by sta_info_alloc() * that may happen before sta_info_insert(). It may only be * called when sta_info_insert() has not been attempted (and * if that fails, the station is freed anyway.) */ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) { /* * If we had used sta_info_pre_move_state() then we might not * have gone through the state transitions down again, so do * it here now (and warn if it's inserted). * * This will clear state such as fast TX/RX that may have been * allocated during state transitions. */ while (sta->sta_state > IEEE80211_STA_NONE) { int ret; WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); ret = sta_info_move_state(sta, sta->sta_state - 1); if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret)) break; } if (sta->rate_ctrl) rate_control_free_sta(sta); sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); if (sta->sta.txq[0]) kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif free_percpu(sta->pcpu_rx_stats); kfree(sta); } /* Caller must hold local->sta_mtx */ static int sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_insert(&local->sta_hash, &sta->hash_node, sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) { struct sta_info *sta; sta = container_of(wk, struct sta_info, drv_deliver_wk); if (sta->dead) return; local_bh_disable(); if (!test_sta_flag(sta, WLAN_STA_PS_STA)) ieee80211_sta_ps_deliver_wakeup(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL)) ieee80211_sta_ps_deliver_poll_response(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD)) ieee80211_sta_ps_deliver_uapsd(sta); local_bh_enable(); } static int sta_prepare_rate_control(struct ieee80211_local *local, struct sta_info *sta, gfp_t gfp) { if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) return 0; sta->rate_ctrl = local->rate_ctrl; sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, sta, gfp); if (!sta->rate_ctrl_priv) return -ENOMEM; return 0; } struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, gfp_t gfp) { struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); if (!sta) return NULL; if (ieee80211_hw_check(hw, USES_RSS)) { sta->pcpu_rx_stats = alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp); if (!sta->pcpu_rx_stats) goto free; } spin_lock_init(&sta->lock); spin_lock_init(&sta->ps_lock); INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames); INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); mutex_init(&sta->ampdu_mlme.mtx); #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) { sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); if (!sta->mesh) goto free; sta->mesh->plink_sta = sta; spin_lock_init(&sta->mesh->plink_lock); if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.user_mpm) timer_setup(&sta->mesh->plink_timer, mesh_plink_timer, 0); sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; } #endif memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); sta->sta.max_rx_aggregation_subframes = local->hw.max_rx_aggregation_subframes; /* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only. * The Tx path starts to use a key as soon as the key slot ptk_idx * references to is not NULL. To not use the initial Rx-only key * prematurely for Tx initialize ptk_idx to an impossible PTK keyid * which always will refer to a NULL key. */ BUILD_BUG_ON(ARRAY_SIZE(sta->ptk) <= INVALID_PTK_KEYIDX); sta->ptk_idx = INVALID_PTK_KEYIDX; sta->local = local; sta->sdata = sdata; sta->rx_stats.last_rx = jiffies; u64_stats_init(&sta->rx_stats.syncp); ieee80211_init_frag_cache(&sta->frags); sta->sta_state = IEEE80211_STA_NONE; /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; sta->last_connected = ktime_get_seconds(); ewma_signal_init(&sta->rx_stats_avg.signal); ewma_avg_signal_init(&sta->status_stats.avg_ack_signal); for (i = 0; i < ARRAY_SIZE(sta->rx_stats_avg.chain_signal); i++) ewma_signal_init(&sta->rx_stats_avg.chain_signal[i]); if (local->ops->wake_tx_queue) { void *txq_data; int size = sizeof(struct txq_info) + ALIGN(hw->txq_data_size, sizeof(void *)); txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); if (!txq_data) goto free; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txq = txq_data + i * size; /* might not do anything for the bufferable MMPDU TXQ */ ieee80211_txq_init(sdata, sta, txq, i); } } if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); init_airtime_info(&sta->airtime[i], &local->airtime[i]); } for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); for (i = 0; i < NUM_NL80211_BANDS; i++) { u32 mandatory = 0; int r; if (!hw->wiphy->bands[i]) continue; switch (i) { case NL80211_BAND_2GHZ: /* * We use both here, even if we cannot really know for * sure the station will support both, but the only use * for this is when we don't know anything yet and send * management frames, and then we'll pick the lowest * possible rate anyway. * If we don't include _G here, we cannot find a rate * in P2P, and thus trigger the WARN_ONCE() in rate.c */ mandatory = IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; break; case NL80211_BAND_5GHZ: mandatory = IEEE80211_RATE_MANDATORY_A; break; case NL80211_BAND_60GHZ: WARN_ON(1); mandatory = 0; break; } for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) { struct ieee80211_rate *rate; rate = &hw->wiphy->bands[i]->bitrates[r]; if (!(rate->flags & mandatory)) continue; sta->sta.supp_rates[i] |= BIT(r); } } sta->sta.smps_mode = IEEE80211_SMPS_OFF; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { struct ieee80211_supported_band *sband; u8 smps; sband = ieee80211_get_sband(sdata); if (!sband) goto free_txq; smps = (sband->ht_cap.cap & IEEE80211_HT_CAP_SM_PS) >> IEEE80211_HT_CAP_SM_PS_SHIFT; /* * Assume that hostapd advertises our caps in the beacon and * this is the known_smps_mode for a station that just assciated */ switch (smps) { case WLAN_HT_SMPS_CONTROL_DISABLED: sta->known_smps_mode = IEEE80211_SMPS_OFF; break; case WLAN_HT_SMPS_CONTROL_STATIC: sta->known_smps_mode = IEEE80211_SMPS_STATIC; break; case WLAN_HT_SMPS_CONTROL_DYNAMIC: sta->known_smps_mode = IEEE80211_SMPS_DYNAMIC; break; default: WARN_ON(1); } } sta->sta.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA; sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD; sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); return sta; free_txq: if (sta->sta.txq[0]) kfree(to_txq_info(sta->sta.txq[0])); free: free_percpu(sta->pcpu_rx_stats); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif kfree(sta); return NULL; } static int sta_info_insert_check(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; /* * Can't be a WARN_ON because it can be triggered through a race: * something inserts a STA (on one CPU) without holding the RTNL * and another CPU turns off the net device. */ if (unlikely(!ieee80211_sdata_running(sdata))) return -ENETDOWN; if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) || !is_valid_ether_addr(sta->sta.addr))) return -EINVAL; /* The RCU read lock is required by rhashtable due to * asynchronous resize/rehash. We also require the mutex * for correctness. */ rcu_read_lock(); lockdep_assert_held(&sdata->local->sta_mtx); if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) && ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) { rcu_read_unlock(); return -ENOTUNIQ; } rcu_read_unlock(); return 0; } static int sta_info_insert_drv_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { enum ieee80211_sta_state state; int err = 0; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) { err = drv_sta_state(local, sdata, sta, state, state + 1); if (err) break; } if (!err) { /* * Drivers using legacy sta_add/sta_remove callbacks only * get uploaded set to true after sta_add is called. */ if (!local->ops->sta_add) sta->uploaded = true; return 0; } if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { sdata_info(sdata, "failed to move IBSS STA %pM to state %d (%d) - keeping it anyway\n", sta->sta.addr, state + 1, err); err = 0; } /* unwind on error */ for (; state > IEEE80211_STA_NOTEXIST; state--) WARN_ON(drv_sta_state(local, sdata, sta, state, state - 1)); return err; } static void ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; bool allow_p2p_go_ps = sdata->vif.p2p; struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || !test_sta_flag(sta, WLAN_STA_ASSOC)) continue; if (!sta->sta.support_p2p_ps) { allow_p2p_go_ps = false; break; } } rcu_read_unlock(); if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_P2P_PS); } } /* * should be called with sta_mtx locked * this function replaces the mutex lock * with a RCU lock */ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo = NULL; int err = 0; lockdep_assert_held(&local->sta_mtx); /* check if STA exists already */ if (sta_info_get_bss(sdata, sta->sta.addr)) { err = -EEXIST; goto out_cleanup; } sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); if (!sinfo) { err = -ENOMEM; goto out_cleanup; } local->num_sta++; local->sta_generation++; smp_mb(); /* simplify things and don't accept BA sessions yet */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); /* make the station visible */ err = sta_info_hash_add(local, sta); if (err) goto out_drop_sta; list_add_tail_rcu(&sta->list, &local->sta_list); /* notify driver */ err = sta_info_insert_drv_state(local, sdata, sta); if (err) goto out_remove; set_sta_flag(sta, WLAN_STA_INSERTED); if (sta->sta_state >= IEEE80211_STA_ASSOC) { ieee80211_recalc_min_chandef(sta->sdata); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); sinfo->generation = local->sta_generation; cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr); /* move reference to rcu-protected */ rcu_read_lock(); mutex_unlock(&local->sta_mtx); if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_accept_plinks_update(sdata); return 0; out_remove: sta_info_hash_del(local, sta); list_del_rcu(&sta->list); out_drop_sta: local->num_sta--; synchronize_net(); out_cleanup: cleanup_single_sta(sta); mutex_unlock(&local->sta_mtx); kfree(sinfo); rcu_read_lock(); return err; } int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; int err; might_sleep(); mutex_lock(&local->sta_mtx); err = sta_info_insert_check(sta); if (err) { sta_info_free(local, sta); mutex_unlock(&local->sta_mtx); rcu_read_lock(); return err; } return sta_info_insert_finish(sta); } int sta_info_insert(struct sta_info *sta) { int err = sta_info_insert_rcu(sta); rcu_read_unlock(); return err; } static inline void __bss_tim_set(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __set_bit() format. */ tim[id / 8] |= (1 << (id % 8)); } static inline void __bss_tim_clear(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __clear_bit() format. */ tim[id / 8] &= ~(1 << (id % 8)); } static inline bool __bss_tim_get(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the test_bit() format. */ return tim[id / 8] & (1 << (id % 8)); } static unsigned long ieee80211_tids_for_ac(int ac) { /* If we ever support TIDs > 7, this obviously needs to be adjusted */ switch (ac) { case IEEE80211_AC_VO: return BIT(6) | BIT(7); case IEEE80211_AC_VI: return BIT(4) | BIT(5); case IEEE80211_AC_BE: return BIT(0) | BIT(3); case IEEE80211_AC_BK: return BIT(1) | BIT(2); default: WARN_ON(1); return 0; } } static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) { struct ieee80211_local *local = sta->local; struct ps_data *ps; bool indicate_tim = false; u8 ignore_for_tim = sta->sta.uapsd_queues; int ac; u16 id = sta->sta.aid; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (WARN_ON_ONCE(!sta->sdata->bss)) return; ps = &sta->sdata->bss->ps; #ifdef CONFIG_MAC80211_MESH } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { ps = &sta->sdata->u.mesh.ps; #endif } else { return; } /* No need to do anything if the driver does all */ if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim) return; if (sta->dead) goto done; /* * If all ACs are delivery-enabled then we should build * the TIM bit for all ACs anyway; if only some are then * we ignore those and build the TIM bit using only the * non-enabled ones. */ if (ignore_for_tim == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_tim = 0; if (ignore_pending) ignore_for_tim = BIT(IEEE80211_NUM_ACS) - 1; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac]) continue; indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac]); if (indicate_tim) break; tids = ieee80211_tids_for_ac(ac); indicate_tim |= sta->driver_buffered_tids & tids; indicate_tim |= sta->txq_buffered_tids & tids; } done: spin_lock_bh(&local->tim_lock); if (indicate_tim == __bss_tim_get(ps->tim, id)) goto out_unlock; if (indicate_tim) __bss_tim_set(ps->tim, id); else __bss_tim_clear(ps->tim, id); if (local->ops->set_tim && !WARN_ON(sta->dead)) { local->tim_in_locked_section = true; drv_set_tim(local, &sta->sta, indicate_tim); local->tim_in_locked_section = false; } out_unlock: spin_unlock_bh(&local->tim_lock); } void sta_info_recalc_tim(struct sta_info *sta) { __sta_info_recalc_tim(sta, false); } static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info; int timeout; if (!skb) return false; info = IEEE80211_SKB_CB(skb); /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ timeout = (sta->listen_interval * sta->sdata->vif.bss_conf.beacon_int * 32 / 15625) * HZ; if (timeout < STA_TX_BUFFER_EXPIRE) timeout = STA_TX_BUFFER_EXPIRE; return time_after(jiffies, info->control.jiffies + timeout); } static bool sta_info_cleanup_expire_buffered_ac(struct ieee80211_local *local, struct sta_info *sta, int ac) { unsigned long flags; struct sk_buff *skb; /* * First check for frames that should expire on the filtered * queue. Frames here were rejected by the driver and are on * a separate queue to avoid reordering with normal PS-buffered * frames. They also aren't accounted for right now in the * total_ps_buffered counter. */ for (;;) { spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb = skb_peek(&sta->tx_filtered[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->tx_filtered[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); /* * Frames are queued in order, so if this one * hasn't expired yet we can stop testing. If * we actually reached the end of the queue we * also need to stop, of course. */ if (!skb) break; ieee80211_free_txskb(&local->hw, skb); } /* * Now also check the normal PS-buffered queue, this will * only find something if the filtered queue was emptied * since the filtered frames are all before the normal PS * buffered frames. */ for (;;) { spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb = skb_peek(&sta->ps_tx_buf[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->ps_tx_buf[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); /* * frames are queued in order, so if this one * hasn't expired yet (or we reached the end of * the queue) we can stop testing */ if (!skb) break; local->total_ps_buffered--; ps_dbg(sta->sdata, "Buffered frame expired (STA %pM)\n", sta->sta.addr); ieee80211_free_txskb(&local->hw, skb); } /* * Finally, recalculate the TIM bit for this station -- it might * now be clear because the station was too slow to retrieve its * frames. */ sta_info_recalc_tim(sta); /* * Return whether there are any frames still buffered, this is * used to check whether the cleanup timer still needs to run, * if there are no frames we don't need to rearm the timer. */ return !(skb_queue_empty(&sta->ps_tx_buf[ac]) && skb_queue_empty(&sta->tx_filtered[ac])); } static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, struct sta_info *sta) { bool have_buffered = false; int ac; /* This is only necessary for stations on BSS/MBSS interfaces */ if (!sta->sdata->bss && !ieee80211_vif_is_mesh(&sta->sdata->vif)) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) have_buffered |= sta_info_cleanup_expire_buffered_ac(local, sta, ac); return have_buffered; } static int __must_check __sta_info_destroy_part1(struct sta_info *sta) { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; int ret; might_sleep(); if (!sta) return -ENOENT; local = sta->local; sdata = sta->sdata; lockdep_assert_held(&local->sta_mtx); /* * Before removing the station from the driver and * rate control, it might still start new aggregation * sessions -- block that to make sure the tear-down * will be sufficient. */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); /* * Before removing the station from the driver there might be pending * rx frames on RSS queues sent prior to the disassociation - wait for * all such frames to be processed. */ drv_sync_rx_queues(local, sta); ret = sta_info_hash_del(local, sta); if (WARN_ON(ret)) return ret; /* * for TDLS peers, make sure to return to the base channel before * removal. */ if (test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) { drv_tdls_cancel_channel_switch(local, sdata, &sta->sta); clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL); } list_del_rcu(&sta->list); sta->removed = true; drv_sta_pre_rcu_remove(local, sta->sdata, sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && rcu_access_pointer(sdata->u.vlan.sta) == sta) RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); return 0; } static void __sta_info_destroy_part2(struct sta_info *sta) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo; int ret; /* * NOTE: This assumes at least synchronize_net() was done * after _part1 and before _part2! */ might_sleep(); lockdep_assert_held(&local->sta_mtx); if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); WARN_ON_ONCE(ret); } /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); /* disable TIM bit - last chance to tell driver */ __sta_info_recalc_tim(sta, true); sta->dead = true; local->num_sta--; local->sta_generation++; while (sta->sta_state > IEEE80211_STA_NONE) { ret = sta_info_move_state(sta, sta->sta_state - 1); if (ret) { WARN_ON_ONCE(1); break; } } if (sta->uploaded) { ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); WARN_ON_ONCE(ret != 0); } sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); if (sinfo) sta_set_sinfo(sta, sinfo, true); cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); ieee80211_sta_debugfs_remove(sta); ieee80211_destroy_frag_cache(&sta->frags); cleanup_single_sta(sta); } int __must_check __sta_info_destroy(struct sta_info *sta) { int err = __sta_info_destroy_part1(sta); if (err) return err; synchronize_net(); __sta_info_destroy_part2(sta); return 0; } int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; int ret; mutex_lock(&sdata->local->sta_mtx); sta = sta_info_get(sdata, addr); ret = __sta_info_destroy(sta); mutex_unlock(&sdata->local->sta_mtx); return ret; } int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; int ret; mutex_lock(&sdata->local->sta_mtx); sta = sta_info_get_bss(sdata, addr); ret = __sta_info_destroy(sta); mutex_unlock(&sdata->local->sta_mtx); return ret; } static void sta_info_cleanup(struct timer_list *t) { struct ieee80211_local *local = from_timer(local, t, sta_cleanup); struct sta_info *sta; bool timer_needed = false; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) if (sta_info_cleanup_expire_buffered(local, sta)) timer_needed = true; rcu_read_unlock(); if (local->quiescing) return; if (!timer_needed) return; mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } int sta_info_init(struct ieee80211_local *local) { int err; err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; spin_lock_init(&local->tim_lock); mutex_init(&local->sta_mtx); INIT_LIST_HEAD(&local->sta_list); timer_setup(&local->sta_cleanup, sta_info_cleanup, 0); return 0; } void sta_info_stop(struct ieee80211_local *local) { del_timer_sync(&local->sta_cleanup); rhltable_destroy(&local->sta_hash); } int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; LIST_HEAD(free_list); int ret = 0; might_sleep(); WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP); WARN_ON(vlans && !sdata->bss); mutex_lock(&local->sta_mtx); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { if (sdata == sta->sdata || (vlans && sdata->bss == sta->sdata->bss)) { if (!WARN_ON(__sta_info_destroy_part1(sta))) list_add(&sta->free_list, &free_list); ret++; } } if (!list_empty(&free_list)) { synchronize_net(); list_for_each_entry_safe(sta, tmp, &free_list, free_list) __sta_info_destroy_part2(sta); } mutex_unlock(&local->sta_mtx); return ret; } void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; mutex_lock(&local->sta_mtx); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sdata != sta->sdata) continue; if (time_is_before_jiffies(last_active + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); if (ieee80211_vif_is_mesh(&sdata->vif) && test_sta_flag(sta, WLAN_STA_PS_STA)) atomic_dec(&sdata->u.mesh.ps.num_sta_ps); WARN_ON(__sta_info_destroy(sta)); } } mutex_unlock(&local->sta_mtx); } struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); struct rhlist_head *tmp; struct sta_info *sta; /* * Just return a random station if localaddr is NULL * ... first in list. */ for_each_sta_info(local, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; if (!sta->uploaded) return NULL; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, const u8 *addr) { struct sta_info *sta; if (!vif) return NULL; sta = sta_info_get_bss(vif_to_sdata(vif), addr); if (!sta) return NULL; if (!sta->uploaded) return NULL; return &sta->sta; } EXPORT_SYMBOL(ieee80211_find_sta); /* powersave support code */ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff_head pending; int filtered = 0, buffered = 0, ac, i; unsigned long flags; struct ps_data *ps; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_SP); BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1); sta->driver_buffered_tids = 0; sta->txq_buffered_tids = 0; if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i])) continue; schedule_and_wake_txq(local, to_txq_info(sta->sta.txq[i])); } skb_queue_head_init(&pending); /* sync with ieee80211_tx_h_unicast_ps_buf */ spin_lock(&sta->ps_lock); /* Send all buffered frames to the station */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = skb_queue_len(&pending), tmp; spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending); spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); tmp = skb_queue_len(&pending); filtered += tmp - count; count = tmp; spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending); spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); tmp = skb_queue_len(&pending); buffered += tmp - count; } ieee80211_add_pending_skbs(local, &pending); /* now we're no longer in the deliver code */ clear_sta_flag(sta, WLAN_STA_PS_DELIVER); /* The station might have polled and then woken up before we responded, * so clear these flags now to avoid them sticking around. */ clear_sta_flag(sta, WLAN_STA_PSPOLL); clear_sta_flag(sta, WLAN_STA_UAPSD); spin_unlock(&sta->ps_lock); atomic_dec(&ps->num_sta_ps); local->total_ps_buffered -= buffered; sta_info_recalc_tim(sta); ps_dbg(sdata, "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n", sta->sta.addr, sta->sta.aid, filtered, buffered); ieee80211_check_fast_xmit(sta); } static void ieee80211_send_null_response(struct sta_info *sta, int tid, enum ieee80211_frame_release_type reason, bool call_driver, bool more_data) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos = sta->sta.wme; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; skb->priority = tid; skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]); if (qos) { nullfunc->qos_ctrl = cpu_to_le16(tid); if (reason == IEEE80211_FRAME_RELEASE_UAPSD) { nullfunc->qos_ctrl |= cpu_to_le16(IEEE80211_QOS_CTL_EOSP); if (more_data) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } } info = IEEE80211_SKB_CB(skb); /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. Also set EOSP to indicate this packet * ends the poll/service period. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; if (call_driver) drv_allow_buffered_frames(local, sta, BIT(tid), 1, reason, false); skb->dev = sdata->dev; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } info->band = chanctx_conf->def.chan->band; ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } static int find_highest_prio_tid(unsigned long tids) { /* lower 3 TIDs aren't ordered perfectly */ if (tids & 0xF8) return fls(tids) - 1; /* TID 0 is BE just like TID 3 */ if (tids & BIT(0)) return 0; return fls(tids) - 1; } /* Indicates if the MORE_DATA bit should be set in the last * frame obtained by ieee80211_sta_ps_get_frames. * Note that driver_release_tids is relevant only if * reason = IEEE80211_FRAME_RELEASE_PSPOLL */ static bool ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs, enum ieee80211_frame_release_type reason, unsigned long driver_release_tids) { int ac; /* If the driver has data on more than one TID then * certainly there's more data if we release just a * single frame now (from a single TID). This will * only happen for PS-Poll. */ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && hweight16(driver_release_tids) > 1) return true; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) return true; } return false; } static void ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason, struct sk_buff_head *frames, unsigned long *driver_release_tids) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ac; /* Get response frame(s) and more data bit for the last one. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; tids = ieee80211_tids_for_ac(ac); /* if we already have frames from software, then we can't also * release from hardware queues */ if (skb_queue_empty(frames)) { *driver_release_tids |= sta->driver_buffered_tids & tids; *driver_release_tids |= sta->txq_buffered_tids & tids; } if (!*driver_release_tids) { struct sk_buff *skb; while (n_frames > 0) { skb = skb_dequeue(&sta->tx_filtered[ac]); if (!skb) { skb = skb_dequeue( &sta->ps_tx_buf[ac]); if (skb) local->total_ps_buffered--; } if (!skb) break; n_frames--; __skb_queue_tail(frames, skb); } } /* If we have more frames buffered on this AC, then abort the * loop since we can't send more data from other ACs before * the buffered frames from this. */ if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) break; } } static void ieee80211_sta_ps_deliver_response(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; unsigned long driver_release_tids = 0; struct sk_buff_head frames; bool more_data; /* Service or PS-Poll period starts */ set_sta_flag(sta, WLAN_STA_SP); __skb_queue_head_init(&frames); ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason, &frames, &driver_release_tids); more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids); if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL) driver_release_tids = BIT(find_highest_prio_tid(driver_release_tids)); if (skb_queue_empty(&frames) && !driver_release_tids) { int tid, ac; /* * For PS-Poll, this can only happen due to a race condition * when we set the TIM bit and the station notices it, but * before it can poll for the frame we expire it. * * For uAPSD, this is said in the standard (11.2.1.5 h): * At each unscheduled SP for a non-AP STA, the AP shall * attempt to transmit at least one MSDU or MMPDU, but no * more than the value specified in the Max SP Length field * in the QoS Capability element from delivery-enabled ACs, * that are destined for the non-AP STA. * * Since we have no other MSDU/MMPDU, transmit a QoS null frame. */ /* This will evaluate to 1, 3, 5 or 7. */ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++) if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac])) break; tid = 7 - 2 * ac; ieee80211_send_null_response(sta, tid, reason, true, false); } else if (!driver_release_tids) { struct sk_buff_head pending; struct sk_buff *skb; int num = 0; u16 tids = 0; bool need_null = false; skb_queue_head_init(&pending); while ((skb = __skb_dequeue(&frames))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qoshdr = NULL; num++; /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; /* * Use MoreData flag to indicate whether there are * more buffered frames for this STA */ if (more_data || !skb_queue_empty(&frames)) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); else hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control)) qoshdr = ieee80211_get_qos_ctl(hdr); tids |= BIT(skb->priority); __skb_queue_tail(&pending, skb); /* end service period after last frame or add one */ if (!skb_queue_empty(&frames)) continue; if (reason != IEEE80211_FRAME_RELEASE_UAPSD) { /* for PS-Poll, there's only one frame */ info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; break; } /* For uAPSD, things are a bit more complicated. If the * last frame has a QoS header (i.e. is a QoS-data or * QoS-nulldata frame) then just set the EOSP bit there * and be done. * If the frame doesn't have a QoS header (which means * it should be a bufferable MMPDU) then we can't set * the EOSP bit in the QoS header; add a QoS-nulldata * frame to the list to send it after the MMPDU. * * Note that this code is only in the mac80211-release * code path, we assume that the driver will not buffer * anything but QoS-data frames, or if it does, will * create the QoS-nulldata frame by itself if needed. * * Cf. 802.11-2012 10.2.1.10 (c). */ if (qoshdr) { *qoshdr |= IEEE80211_QOS_CTL_EOSP; info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; } else { /* The standard isn't completely clear on this * as it says the more-data bit should be set * if there are more BUs. The QoS-Null frame * we're about to send isn't buffered yet, we * only create it below, but let's pretend it * was buffered just in case some clients only * expect more-data=0 when eosp=1. */ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); need_null = true; num++; } break; } drv_allow_buffered_frames(local, sta, tids, num, reason, more_data); ieee80211_add_pending_skbs(local, &pending); if (need_null) ieee80211_send_null_response( sta, find_highest_prio_tid(tids), reason, false, false); sta_info_recalc_tim(sta); } else { int tid; /* * We need to release a frame that is buffered somewhere in the * driver ... it'll have to handle that. * Note that the driver also has to check the number of frames * on the TIDs we're releasing from - if there are more than * n_frames it has to set the more-data bit (if we didn't ask * it to set it anyway due to other buffered frames); if there * are fewer than n_frames it has to make sure to adjust that * to allow the service period to end properly. */ drv_release_buffered_frames(local, sta, driver_release_tids, n_frames, reason, more_data); /* * Note that we don't recalculate the TIM bit here as it would * most likely have no effect at all unless the driver told us * that the TID(s) became empty before returning here from the * release function. * Either way, however, when the driver tells us that the TID(s) * became empty or we find that a txq became empty, we'll do the * TIM recalculation. */ if (!sta->sta.txq[0]) return; for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { if (!sta->sta.txq[tid] || !(driver_release_tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid])) continue; sta_info_recalc_tim(sta); break; } } } void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) { u8 ignore_for_response = sta->sta.uapsd_queues; /* * If all ACs are delivery-enabled then we should reply * from any of them, if only some are enabled we reply * only from the non-enabled ones. */ if (ignore_for_response == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_response = 0; ieee80211_sta_ps_deliver_response(sta, 1, ignore_for_response, IEEE80211_FRAME_RELEASE_PSPOLL); } void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta) { int n_frames = sta->sta.max_sp; u8 delivery_enabled = sta->sta.uapsd_queues; /* * If we ever grow support for TSPEC this might happen if * the TSPEC update from hostapd comes in between a trigger * frame setting WLAN_STA_UAPSD in the RX path and this * actually getting called. */ if (!delivery_enabled) return; switch (sta->sta.max_sp) { case 1: n_frames = 2; break; case 2: n_frames = 4; break; case 3: n_frames = 6; break; case 0: /* XXX: what is a good value? */ n_frames = 128; break; } ieee80211_sta_ps_deliver_response(sta, n_frames, ~delivery_enabled, IEEE80211_FRAME_RELEASE_UAPSD); } void ieee80211_sta_block_awake(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, bool block) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); trace_api_sta_block_awake(sta->local, pubsta, block); if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_clear_fast_xmit(sta); return; } if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) return; if (!test_sta_flag(sta, WLAN_STA_PS_STA)) { set_sta_flag(sta, WLAN_STA_PS_DELIVER); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else if (test_sta_flag(sta, WLAN_STA_PSPOLL) || test_sta_flag(sta, WLAN_STA_UAPSD)) { /* must be asleep in this case */ clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else { clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_check_fast_xmit(sta); } } EXPORT_SYMBOL(ieee80211_sta_block_awake); void ieee80211_sta_eosp(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->local; trace_api_eosp(local, pubsta); clear_sta_flag(sta, WLAN_STA_SP); } EXPORT_SYMBOL(ieee80211_sta_eosp); void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); enum ieee80211_frame_release_type reason; bool more_data; trace_api_send_eosp_nullfunc(sta->local, pubsta, tid); reason = IEEE80211_FRAME_RELEASE_UAPSD; more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues, reason, 0); ieee80211_send_null_response(sta, tid, reason, false, more_data); } EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc); void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, u8 tid, bool buffered) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); if (WARN_ON(tid >= IEEE80211_NUM_TIDS)) return; trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered); if (buffered) set_bit(tid, &sta->driver_buffered_tids); else clear_bit(tid, &sta->driver_buffered_tids); sta_info_recalc_tim(sta); } EXPORT_SYMBOL(ieee80211_sta_set_buffered); void ieee80211_register_airtime(struct ieee80211_txq *txq, u32 tx_airtime, u32 rx_airtime) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); struct ieee80211_local *local = sdata->local; u64 weight_sum, weight_sum_reciprocal; struct airtime_sched_info *air_sched; struct airtime_info *air_info; u32 airtime = 0; air_sched = &local->airtime[txq->ac]; air_info = to_airtime_info(txq); if (local->airtime_flags & AIRTIME_USE_TX) airtime += tx_airtime; if (local->airtime_flags & AIRTIME_USE_RX) airtime += rx_airtime; /* Weights scale so the unit weight is 256 */ airtime <<= 8; spin_lock_bh(&air_sched->lock); air_info->tx_airtime += tx_airtime; air_info->rx_airtime += rx_airtime; if (air_sched->weight_sum) { weight_sum = air_sched->weight_sum; weight_sum_reciprocal = air_sched->weight_sum_reciprocal; } else { weight_sum = air_info->weight; weight_sum_reciprocal = air_info->weight_reciprocal; } /* Round the calculation of global vt */ air_sched->v_t += (u64)((airtime + (weight_sum >> 1)) * weight_sum_reciprocal) >> IEEE80211_RECIPROCAL_SHIFT_64; air_info->v_t += (u32)((airtime + (air_info->weight >> 1)) * air_info->weight_reciprocal) >> IEEE80211_RECIPROCAL_SHIFT_32; ieee80211_resort_txq(&local->hw, txq); spin_unlock_bh(&air_sched->lock); } void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime) { struct ieee80211_txq *txq = pubsta->txq[tid]; if (!txq) return; ieee80211_register_airtime(txq, tx_airtime, rx_airtime); } EXPORT_SYMBOL(ieee80211_sta_register_airtime); void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed) { int tx_pending; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return; if (!tx_completed) { if (sta) atomic_add(tx_airtime, &sta->airtime[ac].aql_tx_pending); atomic_add(tx_airtime, &local->aql_total_pending_airtime); return; } if (sta) { tx_pending = atomic_sub_return(tx_airtime, &sta->airtime[ac].aql_tx_pending); if (tx_pending < 0) atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, tx_pending, 0); } tx_pending = atomic_sub_return(tx_airtime, &local->aql_total_pending_airtime); if (WARN_ONCE(tx_pending < 0, "Device %s AC %d pending airtime underflow: %u, %u", wiphy_name(local->hw.wiphy), ac, tx_pending, tx_airtime)) atomic_cmpxchg(&local->aql_total_pending_airtime, tx_pending, 0); } int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { might_sleep(); if (sta->sta_state == new_state) return 0; /* check allowed transitions first */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state != IEEE80211_STA_AUTH) return -EINVAL; break; case IEEE80211_STA_AUTH: if (sta->sta_state != IEEE80211_STA_NONE && sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; case IEEE80211_STA_ASSOC: if (sta->sta_state != IEEE80211_STA_AUTH && sta->sta_state != IEEE80211_STA_AUTHORIZED) return -EINVAL; break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; default: WARN(1, "invalid state %d", new_state); return -EINVAL; } sta_dbg(sta->sdata, "moving STA %pM to state %d\n", sta->sta.addr, new_state); /* * notify the driver before the actual changes so it can * fail the transition */ if (test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); if (err) return err; } /* reflect the change in all state variables */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state == IEEE80211_STA_AUTH) clear_bit(WLAN_STA_AUTH, &sta->_flags); break; case IEEE80211_STA_AUTH: if (sta->sta_state == IEEE80211_STA_NONE) { set_bit(WLAN_STA_AUTH, &sta->_flags); } else if (sta->sta_state == IEEE80211_STA_ASSOC) { clear_bit(WLAN_STA_ASSOC, &sta->_flags); ieee80211_recalc_min_chandef(sta->sdata); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } break; case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); sta->assoc_at = ktime_get_boottime_ns(); ieee80211_recalc_min_chandef(sta->sdata); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ieee80211_vif_dec_num_mcast(sta->sdata); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_clear_fast_xmit(sta); ieee80211_clear_fast_rx(sta); } break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state == IEEE80211_STA_ASSOC) { ieee80211_vif_inc_num_mcast(sta->sdata); set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_check_fast_xmit(sta); ieee80211_check_fast_rx(sta); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || sta->sdata->vif.type == NL80211_IFTYPE_AP) cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); break; default: break; } sta->sta_state = new_state; return 0; } u8 sta_info_tx_streams(struct sta_info *sta) { struct ieee80211_sta_ht_cap *ht_cap = &sta->sta.ht_cap; u8 rx_streams; if (!sta->sta.ht_cap.ht_supported) return 1; if (sta->sta.vht_cap.vht_supported) { int i; u16 tx_mcs_map = le16_to_cpu(sta->sta.vht_cap.vht_mcs.tx_mcs_map); for (i = 7; i >= 0; i--) if ((tx_mcs_map & (0x3 << (i * 2))) != IEEE80211_VHT_MCS_NOT_SUPPORTED) return i + 1; } if (ht_cap->mcs.rx_mask[3]) rx_streams = 4; else if (ht_cap->mcs.rx_mask[2]) rx_streams = 3; else if (ht_cap->mcs.rx_mask[1]) rx_streams = 2; else rx_streams = 1; if (!(ht_cap->mcs.tx_params & IEEE80211_HT_MCS_TX_RX_DIFF)) return rx_streams; return ((ht_cap->mcs.tx_params & IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK) >> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1; } static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = &sta->rx_stats; int cpu; if (!sta->pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; cpustats = per_cpu_ptr(sta->pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; } return stats; } static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); switch (STA_STATS_GET(TYPE, rate)) { case STA_STATS_RATE_TYPE_VHT: rinfo->flags = RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = STA_STATS_GET(VHT_MCS, rate); rinfo->nss = STA_STATS_GET(VHT_NSS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_HT: rinfo->flags = RATE_INFO_FLAGS_MCS; rinfo->mcs = STA_STATS_GET(HT_MCS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_LEGACY: { struct ieee80211_supported_band *sband; u16 brate; unsigned int shift; int band = STA_STATS_GET(LEGACY_BAND, rate); int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband->bitrates)) break; brate = sband->bitrates[rate_idx].bitrate; if (rinfo->bw == RATE_INFO_BW_5) shift = 2; else if (rinfo->bw == RATE_INFO_BW_10) shift = 1; else shift = 0; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } case STA_STATS_RATE_TYPE_HE: rinfo->flags = RATE_INFO_FLAGS_HE_MCS; rinfo->mcs = STA_STATS_GET(HE_MCS, rate); rinfo->nss = STA_STATS_GET(HE_NSS, rate); rinfo->he_gi = STA_STATS_GET(HE_GI, rate); rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); break; } } static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; sta_stats_decode_rate(sta->local, rate, rinfo); return 0; } static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, int tid) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin_irq(&rxstats->syncp); value = rxstats->msdu[tid]; } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); return value; } static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, int tid) { struct ieee80211_local *local = sta->local; int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->rx_stats, tid); if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); tidstats->rx_msdu += sta_get_tidstats_msdu(cpurxs, tid); } } tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); tidstats->tx_msdu = sta->tx_stats.msdu[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); tidstats->tx_msdu_retries = sta->status_stats.msdu_retries[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); tidstats->tx_msdu_failed = sta->status_stats.msdu_failed[tid]; } if (local->ops->wake_tx_queue && tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); rcu_read_lock(); tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); ieee80211_fill_txq_stats(&tidstats->txq_stats, to_txq_info(sta->sta.txq[tid])); rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); } } static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin_irq(&rxstats->syncp); value = rxstats->bytes; } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); return value; } void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta); sinfo->generation = sdata->local->sta_generation; /* do before driver, so beacon filtering drivers have a * chance to e.g. just add the number of filtered beacons * (or just modify the value entirely, of course) */ if (sdata->vif.type == NL80211_IFTYPE_STATION) sinfo->rx_beacon = sdata->u.mgd.count_beacon_signal; drv_sta_statistics(local, sdata, &sta->sta, sinfo); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) | BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->tx_stats.bytes[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->tx_stats.packets[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->rx_stats.packets; if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); sinfo->rx_packets += cpurxs->packets; } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->status_stats.retry_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->status_stats.retry_failed; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->rx_duration += sta->airtime[ac].rx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_duration += sta->airtime[ac].tx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { sinfo->airtime_weight = sta->airtime[0].weight; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } sinfo->rx_dropped_misc = sta->rx_stats.dropped; if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); sinfo->rx_dropped_misc += cpurxs->dropped; } } if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->pcpu_rx_stats && !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->rx_stats_avg.signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } /* for the average - if pcpu_rx_stats isn't set - rxstats must point to * the sta->rx_stats struct, so the check here is fine with and without * pcpu statistics */ if (last_rxstats->chains && !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->pcpu_rx_stats) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { sinfo->chain_signal[i] = last_rxstats->chain_signal_last[i]; sinfo->chain_signal_avg[i] = -ewma_signal_read(&sta->rx_stats_avg.chain_signal[i]); } } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &sinfo->txrate); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) sta_set_tidstats(sta, &sinfo->pertid[i], i); } if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | BIT_ULL(NL80211_STA_INFO_PLID) | BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; sinfo->connected_to_as = sta->mesh->connected_to_as; #endif } sinfo->bss_param.flags = 0; if (sdata->vif.bss_conf.use_cts_prot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; if (sdata->vif.bss_conf.use_short_preamble) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (sdata->vif.bss_conf.use_short_slot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period; sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; sinfo->sta_flags.set = 0; sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) | BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | BIT(NL80211_STA_FLAG_WME) | BIT(NL80211_STA_FLAG_MFP) | BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED) | BIT(NL80211_STA_FLAG_TDLS_PEER); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); if (sta->sta.wme) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); if (test_sta_flag(sta, WLAN_STA_MFP)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); if (test_sta_flag(sta, WLAN_STA_AUTH)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); if (test_sta_flag(sta, WLAN_STA_ASSOC)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); thr = sta_get_expected_throughput(sta); if (thr != 0) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && sta->status_stats.ack_signal_filled) { sinfo->ack_signal = sta->status_stats.last_ack_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && sta->status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &sta->status_stats.avg_ack_signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } if (ieee80211_vif_is_mesh(&sdata->vif)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); sinfo->airtime_link_metric = airtime_link_metric_get(local, sta); } } u32 sta_get_expected_throughput(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; u32 thr = 0; if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) ref = local->rate_ctrl; /* check if the driver has a SW RC implementation */ if (ref && ref->ops->get_expected_throughput) thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); else thr = drv_get_expected_throughput(local, sta); return thr; } unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); if (!sta->status_stats.last_ack || time_after(stats->last_rx, sta->status_stats.last_ack)) return stats->last_rx; return sta->status_stats.last_ack; } static void sta_update_codel_params(struct sta_info *sta, u32 thr) { if (!sta->sdata->local->ops->wake_tx_queue) return; if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { sta->cparams.target = MS2TIME(50); sta->cparams.interval = MS2TIME(300); sta->cparams.ecn = false; } else { sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; } } void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, u32 thr) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); sta_update_codel_params(sta, thr); }
1085 1084 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de> */ #include <linux/module.h> #include <net/6lowpan.h> #include <net/addrconf.h> #include "6lowpan_i.h" int lowpan_register_netdevice(struct net_device *dev, enum lowpan_lltypes lltype) { int i, ret; switch (lltype) { case LOWPAN_LLTYPE_IEEE802154: dev->addr_len = EUI64_ADDR_LEN; break; case LOWPAN_LLTYPE_BTLE: dev->addr_len = ETH_ALEN; break; } dev->type = ARPHRD_6LOWPAN; dev->mtu = IPV6_MIN_MTU; lowpan_dev(dev)->lltype = lltype; spin_lock_init(&lowpan_dev(dev)->ctx.lock); for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) lowpan_dev(dev)->ctx.table[i].id = i; dev->ndisc_ops = &lowpan_ndisc_ops; ret = register_netdevice(dev); if (ret < 0) return ret; lowpan_dev_debugfs_init(dev); return ret; } EXPORT_SYMBOL(lowpan_register_netdevice); int lowpan_register_netdev(struct net_device *dev, enum lowpan_lltypes lltype) { int ret; rtnl_lock(); ret = lowpan_register_netdevice(dev, lltype); rtnl_unlock(); return ret; } EXPORT_SYMBOL(lowpan_register_netdev); void lowpan_unregister_netdevice(struct net_device *dev) { unregister_netdevice(dev); lowpan_dev_debugfs_exit(dev); } EXPORT_SYMBOL(lowpan_unregister_netdevice); void lowpan_unregister_netdev(struct net_device *dev) { rtnl_lock(); lowpan_unregister_netdevice(dev); rtnl_unlock(); } EXPORT_SYMBOL(lowpan_unregister_netdev); int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) { struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; /* Set short_addr autoconfiguration if short_addr is present only */ if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) return -1; /* For either address format, all zero addresses MUST NOT be used */ if (wpan_dev->pan_id == cpu_to_le16(0x0000) && wpan_dev->short_addr == cpu_to_le16(0x0000)) return -1; /* Alternatively, if no PAN ID is known, 16 zero bits may be used */ if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) memset(eui, 0, 2); else ieee802154_le16_to_be16(eui, &wpan_dev->pan_id); /* The "Universal/Local" (U/L) bit shall be set to zero */ eui[0] &= ~2; eui[2] = 0; eui[3] = 0xFF; eui[4] = 0xFE; eui[5] = 0; ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr); return 0; } static int lowpan_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev; struct in6_addr addr; int i; if (dev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; idev = __in6_dev_get(dev); if (!idev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: case NETDEV_CHANGE: /* (802.15.4 6LoWPAN short address slaac handling */ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) { __ipv6_addr_set_half(&addr.s6_addr32[0], htonl(0xFE800000), 0); addrconf_add_linklocal(idev, &addr, 0); } break; case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &lowpan_dev(dev)->ctx.table[i].flags); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block lowpan_notifier = { .notifier_call = lowpan_event, }; static int __init lowpan_module_init(void) { int ret; lowpan_debugfs_init(); ret = register_netdevice_notifier(&lowpan_notifier); if (ret < 0) { lowpan_debugfs_exit(); return ret; } request_module_nowait("nhc_dest"); request_module_nowait("nhc_fragment"); request_module_nowait("nhc_hop"); request_module_nowait("nhc_ipv6"); request_module_nowait("nhc_mobility"); request_module_nowait("nhc_routing"); request_module_nowait("nhc_udp"); return 0; } static void __exit lowpan_module_exit(void) { lowpan_debugfs_exit(); unregister_netdevice_notifier(&lowpan_notifier); } module_init(lowpan_module_init); module_exit(lowpan_module_exit); MODULE_LICENSE("GPL");
19 25 3 4 19 6 2 5 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 // SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/perf_event.h> #include <linux/slab.h> #include <linux/sched/task_stack.h> #include "internal.h" struct callchain_cpus_entries { struct rcu_head rcu_head; struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + sizeof(__u64) * (sysctl_perf_event_max_stack + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; __weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } static void release_callchain_buffers_rcu(struct rcu_head *head) { struct callchain_cpus_entries *entries; int cpu; entries = container_of(head, struct callchain_cpus_entries, rcu_head); for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); } static void release_callchain_buffers(void) { struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } static int alloc_callchain_buffers(void) { int cpu; int size; struct callchain_cpus_entries *entries; /* * We can't use the percpu allocation API for data that can be * accessed from NMI. Use a temporary manual per cpu allocation * until that gets sorted out. */ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); entries = kzalloc(size, GFP_KERNEL); if (!entries) return -ENOMEM; size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!entries->cpu_entries[cpu]) goto fail; } rcu_assign_pointer(callchain_cpus_entries, entries); return 0; fail: for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); return -ENOMEM; } int get_callchain_buffers(int event_max_stack) { int err = 0; int count; mutex_lock(&callchain_mutex); count = atomic_inc_return(&nr_callchain_events); if (WARN_ON_ONCE(count < 1)) { err = -EINVAL; goto exit; } /* * If requesting per event more than the global cap, * return a different error to help userspace figure * this out. * * And also do it here so that we have &callchain_mutex held. */ if (event_max_stack > sysctl_perf_event_max_stack) { err = -EOVERFLOW; goto exit; } if (count == 1) err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); mutex_unlock(&callchain_mutex); return err; } void put_callchain_buffers(void) { if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { release_callchain_buffers(); mutex_unlock(&callchain_mutex); } } struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; entries = rcu_dereference(callchain_cpus_entries); if (!entries) { put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; } cpu = smp_processor_id(); return (((void *)entries->cpu_entries[cpu]) + (*rctx * perf_callchain_entry__sizeof())); } void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = init_nr; ctx.contexts = 0; ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } if (user) { if (!user_mode(regs)) { if (current->mm) regs = task_pt_regs(current); else regs = NULL; } if (regs) { mm_segment_t fs; if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); force_uaccess_end(fs); } } exit_put: put_callchain_entry(rctx); return entry; } /* * Used for sysctl_perf_event_max_stack and * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); if (ret || !write) return ret; mutex_lock(&callchain_mutex); if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else *value = new_value; mutex_unlock(&callchain_mutex); return ret; }
141 141 141 53 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Forwarding Information Base. * * Authors: A.N.Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #ifndef _NET_IP_FIB_H #define _NET_IP_FIB_H #include <net/flow.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/fib_notifier.h> #include <net/fib_rules.h> #include <net/inetpeer.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/refcount.h> struct fib_config { u8 fc_dst_len; u8 fc_tos; u8 fc_protocol; u8 fc_scope; u8 fc_type; u8 fc_gw_family; /* 2 bytes unused */ u32 fc_table; __be32 fc_dst; union { __be32 fc_gw4; struct in6_addr fc_gw6; }; int fc_oif; u32 fc_flags; u32 fc_priority; __be32 fc_prefsrc; u32 fc_nh_id; struct nlattr *fc_mx; struct rtnexthop *fc_mp; int fc_mx_len; int fc_mp_len; u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; }; struct fib_info; struct rtable; struct fib_nh_exception { struct fib_nh_exception __rcu *fnhe_next; int fnhe_genid; __be32 fnhe_daddr; u32 fnhe_pmtu; bool fnhe_mtu_locked; __be32 fnhe_gw; unsigned long fnhe_expires; struct rtable __rcu *fnhe_rth_input; struct rtable __rcu *fnhe_rth_output; unsigned long fnhe_stamp; struct rcu_head rcu; }; struct fnhe_hash_bucket { struct fib_nh_exception __rcu *chain; }; #define FNHE_HASH_SHIFT 11 #define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 struct fib_nh_common { struct net_device *nhc_dev; int nhc_oif; unsigned char nhc_scope; u8 nhc_family; u8 nhc_gw_family; unsigned char nhc_flags; struct lwtunnel_state *nhc_lwtstate; union { __be32 ipv4; struct in6_addr ipv6; } nhc_gw; int nhc_weight; atomic_t nhc_upper_bound; /* v4 specific, but allows fib6_nh with v4 routes */ struct rtable __rcu * __percpu *nhc_pcpu_rth_output; struct rtable __rcu *nhc_rth_input; struct fnhe_hash_bucket __rcu *nhc_exceptions; }; struct fib_nh { struct fib_nh_common nh_common; struct hlist_node nh_hash; struct fib_info *nh_parent; #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif __be32 nh_saddr; int nh_saddr_genid; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev #define fib_nh_oif nh_common.nhc_oif #define fib_nh_flags nh_common.nhc_flags #define fib_nh_lws nh_common.nhc_lwtstate #define fib_nh_scope nh_common.nhc_scope #define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw4 nh_common.nhc_gw.ipv4 #define fib_nh_gw6 nh_common.nhc_gw.ipv6 #define fib_nh_weight nh_common.nhc_weight #define fib_nh_upper_bound nh_common.nhc_upper_bound }; /* * This structure contains data shared by many of routes. */ struct nexthop; struct fib_info { struct hlist_node fib_hash; struct hlist_node fib_lhash; struct list_head nh_list; struct net *fib_net; refcount_t fib_treeref; refcount_t fib_clntref; unsigned int fib_flags; unsigned char fib_dead; unsigned char fib_protocol; unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; struct dst_metrics *fib_metrics; #define fib_mtu fib_metrics->metrics[RTAX_MTU-1] #define fib_window fib_metrics->metrics[RTAX_WINDOW-1] #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; bool fib_nh_is_v6; bool nh_updated; struct nexthop *nh; struct rcu_head rcu; struct fib_nh fib_nh[]; }; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; #endif struct fib_table; struct fib_result { __be32 prefix; unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; u32 tclassid; struct fib_nh_common *nhc; struct fib_info *fi; struct fib_table *table; struct hlist_head *fa_head; }; struct fib_result_nl { __be32 fl_addr; /* To be looked up*/ u32 fl_mark; unsigned char fl_tos; unsigned char fl_scope; unsigned char tb_id_in; unsigned char tb_id; /* Results */ unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; int err; }; #ifdef CONFIG_IP_MULTIPLE_TABLES #define FIB_TABLE_HASHSZ 256 #else #define FIB_TABLE_HASHSZ 2 #endif __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope); __be32 fib_result_prefsrc(struct net *net, struct fib_result *res); #define FIB_RES_NHC(res) ((res).nhc) #define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) #define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) struct fib_rt_info { struct fib_info *fi; u32 tb_id; __be32 dst; int dst_len; u8 tos; u8 type; u8 offload:1, trap:1, offload_failed:1, unused:5; }; struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ u32 dst; int dst_len; struct fib_info *fi; u8 tos; u8 type; u32 tb_id; }; struct fib_nh_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_nh *fib_nh; }; int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct fib_table { struct hlist_node tb_hlist; u32 tb_id; int tb_num_default; struct rcu_head rcu; unsigned long *tb_data; unsigned long __data[]; }; struct fib_dump_filter { u32 table_id; /* filter_set is an optimization that an entry is set */ bool filter_set; bool dump_routes; bool dump_exceptions; unsigned char protocol; unsigned char rt_type; unsigned int flags; struct net_device *dev; }; int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags); int fib_table_insert(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_delete(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter); int fib_table_flush(struct net *net, struct fib_table *table, bool flush_all); struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); #ifndef CONFIG_IP_MULTIPLE_TABLES #define TABLE_LOCAL_INDEX (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1)) #define TABLE_MAIN_INDEX (RT_TABLE_MAIN & (FIB_TABLE_HASHSZ - 1)) static inline struct fib_table *fib_get_table(struct net *net, u32 id) { struct hlist_node *tb_hlist; struct hlist_head *ptr; ptr = id == RT_TABLE_LOCAL ? &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] : &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]; tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr)); return hlist_entry(tb_hlist, struct fib_table, tb_hlist); } static inline struct fib_table *fib_new_table(struct net *net, u32 id) { return fib_get_table(net, id); } static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); if (tb) err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return false; } static inline bool fib4_rule_default(const struct fib_rule *rule) { return true; } static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib4_rules_seq_read(struct net *net) { return 0; } static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { return false; } #else /* CONFIG_IP_MULTIPLE_TABLES */ int __net_init fib4_rules_init(struct net *net); void __net_exit fib4_rules_exit(struct net *net); struct fib_table *fib_new_table(struct net *net, u32 id); struct fib_table *fib_get_table(struct net *net, u32 id); int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags); static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) return __fib_lookup(net, flp, res, flags); rcu_read_lock(); res->tclassid = 0; tb = rcu_dereference_rtnl(net->ipv4.fib_main); if (tb) err = fib_table_lookup(tb, flp, res, flags); if (!err) goto out; tb = rcu_dereference_rtnl(net->ipv4.fib_default); if (tb) err = fib_table_lookup(tb, flp, res, flags); out: if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return net->ipv4.fib_has_custom_rules; } bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib4_rules_seq_read(struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv4.fib_rules_require_fldissect) return false; skb_flow_dissect_flow_keys(skb, flkeys, flag); fl4->fl4_sport = flkeys->ports.src; fl4->fl4_dport = flkeys->ports.dst; fl4->flowi4_proto = flkeys->basic.ip_proto; return true; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack); __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { return atomic_read(&net->ipv4.fib_num_tclassid_users); } #else static inline int fib_num_tclassid_users(struct net *net) { return 0; } #endif int fib_unmerge(struct net *net); static inline bool nhc_l3mdev_matches_dev(const struct fib_nh_common *nhc, const struct net_device *dev) { if (nhc->nhc_dev == dev || l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) return true; return false; } /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned char nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig); /* Fields used for sysctl_fib_multipath_hash_fields. * Common to IPv4 and IPv6. * * Add new fields at the end. This is user API. */ #define FIB_MULTIPATH_HASH_FIELD_SRC_IP BIT(0) #define FIB_MULTIPATH_HASH_FIELD_DST_IP BIT(1) #define FIB_MULTIPATH_HASH_FIELD_IP_PROTO BIT(2) #define FIB_MULTIPATH_HASH_FIELD_FLOWLABEL BIT(3) #define FIB_MULTIPATH_HASH_FIELD_SRC_PORT BIT(4) #define FIB_MULTIPATH_HASH_FIELD_DST_PORT BIT(5) #define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP BIT(6) #define FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP BIT(7) #define FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO BIT(8) #define FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL BIT(9) #define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT BIT(10) #define FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT BIT(11) #define FIB_MULTIPATH_HASH_FIELD_OUTER_MASK \ (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_IP_PROTO | \ FIB_MULTIPATH_HASH_FIELD_FLOWLABEL | \ FIB_MULTIPATH_HASH_FIELD_SRC_PORT | \ FIB_MULTIPATH_HASH_FIELD_DST_PORT) #define FIB_MULTIPATH_HASH_FIELD_INNER_MASK \ (FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO | \ FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL | \ FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT | \ FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) #define FIB_MULTIPATH_HASH_FIELD_ALL_MASK \ (FIB_MULTIPATH_HASH_FIELD_OUTER_MASK | \ FIB_MULTIPATH_HASH_FIELD_INNER_MASK) #define FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK \ (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_IP_PROTO) #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); #endif int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); void fib_select_multipath(struct fib_result *res, int hash); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); int fib_nh_init(struct net *net, struct fib_nh *fib_nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack); void fib_nh_release(struct net *net, struct fib_nh *fib_nh); int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, struct nlattr *fc_encap, u16 fc_encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp); static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) { #ifdef CONFIG_IP_ROUTE_CLASSID struct fib_nh_common *nhc = res->nhc; #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); *itag = nh->nh_tclassid << 16; } else { *itag = 0; } #ifdef CONFIG_IP_MULTIPLE_TABLES rtag = res->tclassid; if (*itag == 0) *itag = (rtag<<16); *itag |= (rtag>>16); #endif #endif } void fib_flush(struct net *net); void free_fib_info(struct fib_info *fi); static inline void fib_info_hold(struct fib_info *fi) { refcount_inc(&fi->fib_clntref); } static inline void fib_info_put(struct fib_info *fi) { if (refcount_dec_and_test(&fi->fib_clntref)) free_fib_info(fi); } #ifdef CONFIG_PROC_FS int __net_init fib_proc_init(struct net *net); void __net_exit fib_proc_exit(struct net *net); #else static inline int fib_proc_init(struct net *net) { return 0; } static inline void fib_proc_exit(struct net *net) { } #endif u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb); int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, u8 rt_family, unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, int nh_weight, u8 rt_family, u32 nh_tclassid); #endif /* _NET_FIB_H */
148 147 148 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 // SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/route.h> #include <linux/ip.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables mangle table"); #define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, }; static unsigned int ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) { unsigned int ret; const struct iphdr *iph; u_int8_t tos; __be32 saddr, daddr; u_int32_t mark; int err; /* Save things which could affect route */ mark = skb->mark; iph = ip_hdr(skb); saddr = iph->saddr; daddr = iph->daddr; tos = iph->tos; ret = ipt_do_table(skb, state, priv); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } } return ret; } /* The work comes in here from netfilter.c. */ static unsigned int iptable_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, state, priv); return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *mangle_ops __read_mostly; static int iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; int ret; repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { ipt_unregister_table_pre_exit(net, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) { ipt_unregister_table_exit(net, "mangle"); } static struct pernet_operations iptable_mangle_net_ops = { .pre_exit = iptable_mangle_net_pre_exit, .exit = iptable_mangle_net_exit, }; static int __init iptable_mangle_init(void) { int ret = xt_register_template(&packet_mangler, iptable_mangle_table_init); if (ret < 0) return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); if (IS_ERR(mangle_ops)) { xt_unregister_template(&packet_mangler); ret = PTR_ERR(mangle_ops); return ret; } ret = register_pernet_subsys(&iptable_mangle_net_ops); if (ret < 0) { xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } return ret; } static void __exit iptable_mangle_fini(void) { unregister_pernet_subsys(&iptable_mangle_net_ops); xt_unregister_template(&packet_mangler); kfree(mangle_ops); } module_init(iptable_mangle_init); module_exit(iptable_mangle_fini);
15 15 15 32 32 15 32 32 29 29 29 29 29 15 15 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 15 15 15 15 15 15 15 15 12 12 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 // SPDX-License-Identifier: GPL-2.0-or-later /* Keyring handling * * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/err.h> #include <linux/user_namespace.h> #include <linux/nsproxy.h> #include <keys/keyring-type.h> #include <keys/user-type.h> #include <linux/assoc_array_priv.h> #include <linux/uaccess.h> #include <net/net_namespace.h> #include "internal.h" /* * When plumbing the depths of the key tree, this sets a hard limit * set on how deep we're willing to go. */ #define KEYRING_SEARCH_MAX_DEPTH 6 /* * We mark pointers we pass to the associative array with bit 1 set if * they're keyrings and clear otherwise. */ #define KEYRING_PTR_SUBTYPE 0x2UL static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x) { return (unsigned long)x & KEYRING_PTR_SUBTYPE; } static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x) { void *object = assoc_array_ptr_to_leaf(x); return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE); } static inline void *keyring_key_to_ptr(struct key *key) { if (key->type == &key_type_keyring) return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE); return key; } static DEFINE_RWLOCK(keyring_name_lock); /* * Clean up the bits of user_namespace that belong to us. */ void key_free_user_ns(struct user_namespace *ns) { write_lock(&keyring_name_lock); list_del_init(&ns->keyring_name_list); write_unlock(&keyring_name_lock); key_put(ns->user_keyring_register); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif } /* * The keyring key type definition. Keyrings are simply keys of this type and * can be treated as ordinary keys in addition to having their own special * operations. */ static int keyring_preparse(struct key_preparsed_payload *prep); static void keyring_free_preparse(struct key_preparsed_payload *prep); static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); static long keyring_read(const struct key *keyring, char __user *buffer, size_t buflen); struct key_type key_type_keyring = { .name = "keyring", .def_datalen = 0, .preparse = keyring_preparse, .free_preparse = keyring_free_preparse, .instantiate = keyring_instantiate, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, .read = keyring_read, }; EXPORT_SYMBOL(key_type_keyring); /* * Semaphore to serialise link/link calls to prevent two link calls in parallel * introducing a cycle. */ static DEFINE_MUTEX(keyring_serialise_link_lock); /* * Publish the name of a keyring so that it can be found by name (if it has * one and it doesn't begin with a dot). */ static void keyring_publish_name(struct key *keyring) { struct user_namespace *ns = current_user_ns(); if (keyring->description && keyring->description[0] && keyring->description[0] != '.') { write_lock(&keyring_name_lock); list_add_tail(&keyring->name_link, &ns->keyring_name_list); write_unlock(&keyring_name_lock); } } /* * Preparse a keyring payload */ static int keyring_preparse(struct key_preparsed_payload *prep) { return prep->datalen != 0 ? -EINVAL : 0; } /* * Free a preparse of a user defined key payload */ static void keyring_free_preparse(struct key_preparsed_payload *prep) { } /* * Initialise a keyring. * * Returns 0 on success, -EINVAL if given any data. */ static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep) { assoc_array_init(&keyring->keys); /* make the keyring available by name if it has one */ keyring_publish_name(keyring); return 0; } /* * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit. Ideally we'd * fold the carry back too, but that requires inline asm. */ static u64 mult_64x32_and_fold(u64 x, u32 y) { u64 hi = (u64)(u32)(x >> 32) * y; u64 lo = (u64)(u32)(x) * y; return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32); } /* * Hash a key type and description. */ static void hash_key_type_and_desc(struct keyring_index_key *index_key) { const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP; const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK; const char *description = index_key->description; unsigned long hash, type; u32 piece; u64 acc; int n, desc_len = index_key->desc_len; type = (unsigned long)index_key->type; acc = mult_64x32_and_fold(type, desc_len + 13); acc = mult_64x32_and_fold(acc, 9207); piece = (unsigned long)index_key->domain_tag; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); for (;;) { n = desc_len; if (n <= 0) break; if (n > 4) n = 4; piece = 0; memcpy(&piece, description, n); description += n; desc_len -= n; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); } /* Fold the hash down to 32 bits if need be. */ hash = acc; if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32) hash ^= acc >> 32; /* Squidge all the keyrings into a separate part of the tree to * ordinary keys by making sure the lowest level segment in the hash is * zero for keyrings and non-zero otherwise. */ if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0) hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1; else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0) hash = (hash + (hash << level_shift)) & ~fan_mask; index_key->hash = hash; } /* * Finalise an index key to include a part of the description actually in the * index key, to set the domain tag and to calculate the hash. */ void key_set_index_key(struct keyring_index_key *index_key) { static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), }; size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc)); memcpy(index_key->desc, index_key->description, n); if (!index_key->domain_tag) { if (index_key->type->flags & KEY_TYPE_NET_DOMAIN) index_key->domain_tag = current->nsproxy->net_ns->key_domain; else index_key->domain_tag = &default_domain_tag; } hash_key_type_and_desc(index_key); } /** * key_put_tag - Release a ref on a tag. * @tag: The tag to release. * * This releases a reference the given tag and returns true if that ref was the * last one. */ bool key_put_tag(struct key_tag *tag) { if (refcount_dec_and_test(&tag->usage)) { kfree_rcu(tag, rcu); return true; } return false; } /** * key_remove_domain - Kill off a key domain and gc its keys * @domain_tag: The domain tag to release. * * This marks a domain tag as being dead and releases a ref on it. If that * wasn't the last reference, the garbage collector is poked to try and delete * all keys that were in the domain. */ void key_remove_domain(struct key_tag *domain_tag) { domain_tag->removed = true; if (!key_put_tag(domain_tag)) key_schedule_gc_links(); } /* * Build the next index key chunk. * * We return it one word-sized chunk at a time. */ static unsigned long keyring_get_key_chunk(const void *data, int level) { const struct keyring_index_key *index_key = data; unsigned long chunk = 0; const u8 *d; int desc_len = index_key->desc_len, n = sizeof(chunk); level /= ASSOC_ARRAY_KEY_CHUNK_SIZE; switch (level) { case 0: return index_key->hash; case 1: return index_key->x; case 2: return (unsigned long)index_key->type; case 3: return (unsigned long)index_key->domain_tag; default: level -= 4; if (desc_len <= sizeof(index_key->desc)) return 0; d = index_key->description + sizeof(index_key->desc); d += level * sizeof(long); desc_len -= sizeof(index_key->desc); if (desc_len > n) desc_len = n; do { chunk <<= 8; chunk |= *d++; } while (--desc_len > 0); return chunk; } } static unsigned long keyring_get_object_key_chunk(const void *object, int level) { const struct key *key = keyring_ptr_to_key(object); return keyring_get_key_chunk(&key->index_key, level); } static bool keyring_compare_object(const void *object, const void *data) { const struct keyring_index_key *index_key = data; const struct key *key = keyring_ptr_to_key(object); return key->index_key.type == index_key->type && key->index_key.domain_tag == index_key->domain_tag && key->index_key.desc_len == index_key->desc_len && memcmp(key->index_key.description, index_key->description, index_key->desc_len) == 0; } /* * Compare the index keys of a pair of objects and determine the bit position * at which they differ - if they differ. */ static int keyring_diff_objects(const void *object, const void *data) { const struct key *key_a = keyring_ptr_to_key(object); const struct keyring_index_key *a = &key_a->index_key; const struct keyring_index_key *b = data; unsigned long seg_a, seg_b; int level, i; level = 0; seg_a = a->hash; seg_b = b->hash; if ((seg_a ^ seg_b) != 0) goto differ; level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8; /* The number of bits contributed by the hash is controlled by a * constant in the assoc_array headers. Everything else thereafter we * can deal with as being machine word-size dependent. */ seg_a = a->x; seg_b = b->x; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); /* The next bit may not work on big endian */ seg_a = (unsigned long)a->type; seg_b = (unsigned long)b->type; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); seg_a = (unsigned long)a->domain_tag; seg_b = (unsigned long)b->domain_tag; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); i = sizeof(a->desc); if (a->desc_len <= i) goto same; for (; i < a->desc_len; i++) { seg_a = *(unsigned char *)(a->description + i); seg_b = *(unsigned char *)(b->description + i); if ((seg_a ^ seg_b) != 0) goto differ_plus_i; } same: return -1; differ_plus_i: level += i; differ: i = level * 8 + __ffs(seg_a ^ seg_b); return i; } /* * Free an object after stripping the keyring flag off of the pointer. */ static void keyring_free_object(void *object) { key_put(keyring_ptr_to_key(object)); } /* * Operations for keyring management by the index-tree routines. */ static const struct assoc_array_ops keyring_assoc_array_ops = { .get_key_chunk = keyring_get_key_chunk, .get_object_key_chunk = keyring_get_object_key_chunk, .compare_object = keyring_compare_object, .diff_objects = keyring_diff_objects, .free_object = keyring_free_object, }; /* * Clean up a keyring when it is destroyed. Unpublish its name if it had one * and dispose of its data. * * The garbage collector detects the final key_put(), removes the keyring from * the serial number tree and then does RCU synchronisation before coming here, * so we shouldn't need to worry about code poking around here with the RCU * readlock held by this time. */ static void keyring_destroy(struct key *keyring) { if (keyring->description) { write_lock(&keyring_name_lock); if (keyring->name_link.next != NULL && !list_empty(&keyring->name_link)) list_del(&keyring->name_link); write_unlock(&keyring_name_lock); } if (keyring->restrict_link) { struct key_restriction *keyres = keyring->restrict_link; key_put(keyres->key); kfree(keyres); } assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* * Describe a keyring for /proc. */ static void keyring_describe(const struct key *keyring, struct seq_file *m) { if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); } } struct keyring_read_iterator_context { size_t buflen; size_t count; key_serial_t *buffer; }; static int keyring_read_iterator(const void *object, void *data) { struct keyring_read_iterator_context *ctx = data; const struct key *key = keyring_ptr_to_key(object); kenter("{%s,%d},,{%zu/%zu}", key->type->name, key->serial, ctx->count, ctx->buflen); if (ctx->count >= ctx->buflen) return 1; *ctx->buffer++ = key->serial; ctx->count += sizeof(key->serial); return 0; } /* * Read a list of key IDs from the keyring's contents in binary form * * The keyring's semaphore is read-locked by the caller. This prevents someone * from modifying it under us - which could cause us to read key IDs multiple * times. */ static long keyring_read(const struct key *keyring, char *buffer, size_t buflen) { struct keyring_read_iterator_context ctx; long ret; kenter("{%d},,%zu", key_serial(keyring), buflen); if (buflen & (sizeof(key_serial_t) - 1)) return -EINVAL; /* Copy as many key IDs as fit into the buffer */ if (buffer && buflen) { ctx.buffer = (key_serial_t *)buffer; ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { kleave(" = %ld [iterate]", ret); return ret; } } /* Return the size of the buffer needed */ ret = keyring->keys.nr_leaves_on_tree * sizeof(key_serial_t); if (ret <= buflen) kleave("= %ld [ok]", ret); else kleave("= %ld [buffer too small]", ret); return ret; } /* * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { key_put(keyring); keyring = ERR_PTR(ret); } } return keyring; } EXPORT_SYMBOL(keyring_alloc); /** * restrict_link_reject - Give -EPERM to restrict link * @keyring: The keyring being added to. * @type: The type of key being added. * @payload: The payload of the key intended to be added. * @restriction_key: Keys providing additional data for evaluating restriction. * * Reject the addition of any links to a keyring. It can be overridden by * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when * adding a key to a keyring. * * This is meant to be stored in a key_restriction structure which is passed * in the restrict_link parameter to keyring_alloc(). */ int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return -EPERM; } /* * By default, we keys found by getting an exact match on their descriptions. */ bool key_default_cmp(const struct key *key, const struct key_match_data *match_data) { return strcmp(key->description, match_data->raw_data) == 0; } /* * Iteration function to consider each key found. */ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); unsigned long kflags = READ_ONCE(key->flags); short state = READ_ONCE(key->state); kenter("{%d}", key->serial); /* ignore keys not of this type */ if (key->type != ctx->index_key.type) { kleave(" = 0 [!type]"); return 0; } /* skip invalidated, revoked and expired keys */ if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { time64_t expiry = READ_ONCE(key->expiry); if (kflags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { ctx->result = ERR_PTR(-EKEYREVOKED); kleave(" = %d [invrev]", ctx->skipped_ret); goto skipped; } if (expiry && ctx->now >= expiry) { if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED)) ctx->result = ERR_PTR(-EKEYEXPIRED); kleave(" = %d [expire]", ctx->skipped_ret); goto skipped; } } /* keys that don't match */ if (!ctx->match_data.cmp(key, &ctx->match_data)) { kleave(" = 0 [!match]"); return 0; } /* key must have search permissions */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) { ctx->result = ERR_PTR(-EACCES); kleave(" = %d [!perm]", ctx->skipped_ret); goto skipped; } if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ if (state < 0) { ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } } /* Found */ ctx->result = make_key_ref(key, ctx->possessed); kleave(" = 1 [found]"); return 1; skipped: return ctx->skipped_ret; } /* * Search inside a keyring for a key. We can search by walking to it * directly based on its index-key or we can iterate over the entire * tree looking for it, based on the match function. */ static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) { if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_DIRECT) { const void *object; object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, &ctx->index_key); return object ? ctx->iterator(object, ctx) : 0; } return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); } /* * Search a tree of keyrings that point to other keyrings up to the maximum * depth. */ static bool search_nested_keyrings(struct key *keyring, struct keyring_search_context *ctx) { struct { struct key *keyring; struct assoc_array_node *node; int slot; } stack[KEYRING_SEARCH_MAX_DEPTH]; struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; struct key *key; int sp = 0, slot; kenter("{%d},{%s,%s}", keyring->serial, ctx->index_key.type->name, ctx->index_key.description); #define STATE_CHECKS (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_DO_STATE_CHECK) BUG_ON((ctx->flags & STATE_CHECKS) == 0 || (ctx->flags & STATE_CHECKS) == STATE_CHECKS); if (ctx->index_key.description) key_set_index_key(&ctx->index_key); /* Check to see if this top-level keyring is what we are looking for * and whether it is valid or not. */ if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_ITERATE || keyring_compare_object(keyring, &ctx->index_key)) { ctx->skipped_ret = 2; switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) { case 1: goto found; case 2: return false; default: break; } } ctx->skipped_ret = 0; /* Start processing a new keyring */ descend_to_keyring: kdebug("descend to %d", keyring->serial); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto not_this_keyring; /* Search through the keys in this keyring before its searching its * subtrees. */ if (search_keyring(keyring, ctx)) goto found; /* Then manually iterate through the keyrings nested in this one. * * Start from the root node of the index tree. Because of the way the * hash function has been set up, keyrings cluster on the leftmost * branch of the root node (root slot 0) or in the root node itself. * Non-keyrings avoid the leftmost branch of the root entirely (root * slots 1-15). */ if (!(ctx->flags & KEYRING_SEARCH_RECURSE)) goto not_this_keyring; ptr = READ_ONCE(keyring->keys.root); if (!ptr) goto not_this_keyring; if (assoc_array_ptr_is_shortcut(ptr)) { /* If the root is a shortcut, either the keyring only contains * keyring pointers (everything clusters behind root slot 0) or * doesn't contain any keyring pointers. */ shortcut = assoc_array_ptr_to_shortcut(ptr); if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) goto not_this_keyring; ptr = READ_ONCE(shortcut->next_node); node = assoc_array_ptr_to_node(ptr); goto begin_node; } node = assoc_array_ptr_to_node(ptr); ptr = node->slots[0]; if (!assoc_array_ptr_is_meta(ptr)) goto begin_node; descend_to_node: /* Descend to a more distal node in this keyring's content tree and go * through that. */ kdebug("descend"); if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->next_node); BUG_ON(!assoc_array_ptr_is_node(ptr)); } node = assoc_array_ptr_to_node(ptr); begin_node: kdebug("begin_node"); slot = 0; ascend_to_node: /* Go through the slots in a node */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr) && node->back_pointer) goto descend_to_node; if (!keyring_ptr_is_keyring(ptr)) continue; key = keyring_ptr_to_key(ptr); if (sp >= KEYRING_SEARCH_MAX_DEPTH) { if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { ctx->result = ERR_PTR(-ELOOP); return false; } goto not_this_keyring; } /* Search a nested keyring */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; stack[sp].node = node; stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; goto descend_to_keyring; } /* We've dealt with all the slots in the current node, so now we need * to ascend to the parent and continue processing there. */ ptr = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (ptr && assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; } if (!ptr) goto not_this_keyring; node = assoc_array_ptr_to_node(ptr); slot++; /* If we've ascended to the root (zero backpointer), we must have just * finished processing the leftmost branch rather than the root slots - * so there can't be any more keyrings for us to find. */ if (node->back_pointer) { kdebug("ascend %d", slot); goto ascend_to_node; } /* The keyring we're looking at was disqualified or didn't contain a * matching key. */ not_this_keyring: kdebug("not_this_keyring %d", sp); if (sp <= 0) { kleave(" = false"); return false; } /* Resume the processing of a keyring higher up in the tree */ sp--; keyring = stack[sp].keyring; node = stack[sp].node; slot = stack[sp].slot + 1; kdebug("ascend to %d [%d]", keyring->serial, slot); goto ascend_to_node; /* We found a viable match */ found: key = key_ref_to_ptr(ctx->result); key_check(key); if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { key->last_used_at = ctx->now; keyring->last_used_at = ctx->now; while (sp > 0) stack[--sp].keyring->last_used_at = ctx->now; } kleave(" = true"); return true; } /** * keyring_search_rcu - Search a keyring tree for a matching key under RCU * @keyring_ref: A pointer to the keyring with possession indicator. * @ctx: The keyring search context. * * Search the supplied keyring tree for a key that matches the criteria given. * The root keyring and any linked keyrings must grant Search permission to the * caller to be searchable and keys can only be found if they too grant Search * to the caller. The possession flag on the root keyring pointer controls use * of the possessor bits in permissions checking of the entire tree. In * addition, the LSM gets to forbid keyring searches and key matches. * * The search is performed as a breadth-then-depth search up to the prescribed * limit (KEYRING_SEARCH_MAX_DEPTH). The caller must hold the RCU read lock to * prevent keyrings from being destroyed or rearranged whilst they are being * searched. * * Keys are matched to the type provided and are then filtered by the match * function, which is given the description to use in any way it sees fit. The * match function may use any attributes of a key that it wishes to * determine the match. Normally the match function from the key type would be * used. * * RCU can be used to prevent the keyring key lists from disappearing without * the need to take lots of locks. * * Returns a pointer to the found key and increments the key usage count if * successful; -EAGAIN if no matching keys were found, or if expired or revoked * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the * specified keyring wasn't a keyring. * * In the case of a successful return, the possession attribute from * @keyring_ref is propagated to the returned key reference. */ key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx) { struct key *keyring; long err; ctx->iterator = keyring_search_iterator; ctx->possessed = is_key_possessed(keyring_ref); ctx->result = ERR_PTR(-EAGAIN); keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return ERR_PTR(-ENOTDIR); if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { err = key_task_permission(keyring_ref, ctx->cred, KEY_NEED_SEARCH); if (err < 0) return ERR_PTR(err); } ctx->now = ktime_get_real_seconds(); if (search_nested_keyrings(keyring, ctx)) __key_get(key_ref_to_ptr(ctx->result)); return ctx->result; } /** * keyring_search - Search the supplied keyring tree for a matching key * @keyring: The root of the keyring tree to be searched. * @type: The type of keyring we want to find. * @description: The name of the keyring we want to find. * @recurse: True to search the children of @keyring also * * As keyring_search_rcu() above, but using the current task's credentials and * type's default matching function and preferred search method. */ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; key_ref_t key; int ret; if (recurse) ctx.flags |= KEYRING_SEARCH_RECURSE; if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) return ERR_PTR(ret); } rcu_read_lock(); key = keyring_search_rcu(keyring, &ctx); rcu_read_unlock(); if (type->match_free) type->match_free(&ctx.match_data); return key; } EXPORT_SYMBOL(keyring_search); static struct key_restriction *keyring_restriction_alloc( key_restrict_link_func_t check) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; return keyres; } /* * Semaphore to serialise restriction setup to prevent reference count * cycles through restriction key pointers. */ static DECLARE_RWSEM(keyring_serialise_restrict_sem); /* * Check for restriction cycles that would prevent keyring garbage collection. * keyring_serialise_restrict_sem must be held. */ static bool keyring_detect_restriction_cycle(const struct key *dest_keyring, struct key_restriction *keyres) { while (keyres && keyres->key && keyres->key->type == &key_type_keyring) { if (keyres->key == dest_keyring) return true; keyres = keyres->key->restrict_link; } return false; } /** * keyring_restrict - Look up and apply a restriction to a keyring * @keyring_ref: The keyring to be restricted * @type: The key type that will provide the restriction checker. * @restriction: The restriction options to apply to the keyring * * Look up a keyring and apply a restriction to it. The restriction is managed * by the specific key type, but can be configured by the options specified in * the restriction string. */ int keyring_restrict(key_ref_t keyring_ref, const char *type, const char *restriction) { struct key *keyring; struct key_type *restrict_type = NULL; struct key_restriction *restrict_link; int ret = 0; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return -ENOTDIR; if (!type) { restrict_link = keyring_restriction_alloc(restrict_link_reject); } else { restrict_type = key_type_lookup(type); if (IS_ERR(restrict_type)) return PTR_ERR(restrict_type); if (!restrict_type->lookup_restriction) { ret = -ENOENT; goto error; } restrict_link = restrict_type->lookup_restriction(restriction); } if (IS_ERR(restrict_link)) { ret = PTR_ERR(restrict_link); goto error; } down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); if (keyring->restrict_link) { ret = -EEXIST; } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; } else { keyring->restrict_link = restrict_link; notify_key(keyring, NOTIFY_KEY_SETATTR, 0); } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); if (ret < 0) { key_put(restrict_link->key); kfree(restrict_link); } error: if (restrict_type) key_type_put(restrict_type); return ret; } EXPORT_SYMBOL(keyring_restrict); /* * Search the given keyring for a key that might be updated. * * The caller must guarantee that the keyring is a keyring and that the * permission is granted to modify the keyring as no check is made here. The * caller must also hold a lock on the keyring semaphore. * * Returns a pointer to the found key with usage count incremented if * successful and returns NULL if not found. Revoked and invalidated keys are * skipped over. * * If successful, the possession indicator is propagated from the keyring ref * to the returned key reference. */ key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key) { struct key *keyring, *key; const void *object; keyring = key_ref_to_ptr(keyring_ref); kenter("{%d},{%s,%s}", keyring->serial, index_key->type->name, index_key->description); object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, index_key); if (object) goto found; kleave(" = NULL"); return NULL; found: key = keyring_ptr_to_key(object); if (key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { kleave(" = NULL [x]"); return NULL; } __key_get(key); kleave(" = {%d}", key->serial); return make_key_ref(key, is_key_possessed(keyring_ref)); } /* * Find a keyring with the specified name. * * Only keyrings that have nonzero refcount, are not revoked, and are owned by a * user in the current user namespace are considered. If @uid_keyring is %true, * the keyring additionally must have been allocated as a user or user session * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct user_namespace *ns = current_user_ns(); struct key *keyring; if (!name) return ERR_PTR(-EINVAL); read_lock(&keyring_name_lock); /* Search this hash bucket for a keyring with a matching name that * grants Search permission and that hasn't been revoked */ list_for_each_entry(keyring, &ns->keyring_name_list, name_link) { if (!kuid_has_mapping(ns, keyring->user->uid)) continue; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) continue; if (strcmp(keyring->description, name) != 0) continue; if (uid_keyring) { if (!test_bit(KEY_FLAG_UID_KEYRING, &keyring->flags)) continue; } else { if (key_permission(make_key_ref(keyring, 0), KEY_NEED_SEARCH) < 0) continue; } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' * (ie. it has a zero usage count) */ if (!refcount_inc_not_zero(&keyring->usage)) continue; keyring->last_used_at = ktime_get_real_seconds(); goto out; } keyring = ERR_PTR(-ENOKEY); out: read_unlock(&keyring_name_lock); return keyring; } static int keyring_detect_cycle_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); kenter("{%d}", key->serial); /* We might get a keyring with matching index-key that is nonetheless a * different keyring. */ if (key != ctx->match_data.raw_data) return 0; ctx->result = ERR_PTR(-EDEADLK); return 1; } /* * See if a cycle will be created by inserting acyclic tree B in acyclic * tree A at the topmost level (ie: as a direct child of A). * * Since we are adding B to A at the top level, checking for cycles should just * be a matter of seeing if node A is somewhere in tree B. */ static int keyring_detect_cycle(struct key *A, struct key *B) { struct keyring_search_context ctx = { .index_key = A->index_key, .match_data.raw_data = A, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .iterator = keyring_detect_cycle_iterator, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_NO_UPDATE_TIME | KEYRING_SEARCH_NO_CHECK_PERM | KEYRING_SEARCH_DETECT_TOO_DEEP | KEYRING_SEARCH_RECURSE), }; rcu_read_lock(); search_nested_keyrings(B, &ctx); rcu_read_unlock(); return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result); } /* * Lock keyring for link. */ int __key_link_lock(struct key *keyring, const struct keyring_index_key *index_key) __acquires(&keyring->sem) __acquires(&keyring_serialise_link_lock) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Lock keyrings for move (link/unlink combination). */ int __key_move_lock(struct key *l_keyring, struct key *u_keyring, const struct keyring_index_key *index_key) __acquires(&l_keyring->sem) __acquires(&u_keyring->sem) __acquires(&keyring_serialise_link_lock) { if (l_keyring->type != &key_type_keyring || u_keyring->type != &key_type_keyring) return -ENOTDIR; /* We have to be very careful here to take the keyring locks in the * right order, lest we open ourselves to deadlocking against another * move operation. */ if (l_keyring < u_keyring) { down_write(&l_keyring->sem); down_write_nested(&u_keyring->sem, 1); } else { down_write(&u_keyring->sem); down_write_nested(&l_keyring->sem, 1); } /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Preallocate memory so that a key can be linked into to a keyring. */ int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; int ret; kenter("%d,%s,%s,", keyring->serial, index_key->type->name, index_key->description); BUG_ON(index_key->desc_len == 0); BUG_ON(*_edit != NULL); *_edit = NULL; ret = -EKEYREVOKED; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) goto error; /* Create an edit script that will insert/replace the key in the * keyring tree. */ edit = assoc_array_insert(&keyring->keys, &keyring_assoc_array_ops, index_key, NULL); if (IS_ERR(edit)) { ret = PTR_ERR(edit); goto error; } /* If we're not replacing a link in-place then we're going to need some * extra quota. */ if (!edit->dead_leaf) { ret = key_payload_reserve(keyring, keyring->datalen + KEYQUOTA_LINK_BYTES); if (ret < 0) goto error_cancel; } *_edit = edit; kleave(" = 0"); return 0; error_cancel: assoc_array_cancel_edit(edit); error: kleave(" = %d", ret); return ret; } /* * Check already instantiated keys aren't going to be a problem. * * The caller must have called __key_link_begin(). Don't need to call this for * keys that were created since __key_link_begin() was called. */ int __key_link_check_live_key(struct key *keyring, struct key *key) { if (key->type == &key_type_keyring) /* check that we aren't going to create a cycle by linking one * keyring to another */ return keyring_detect_cycle(keyring, key); return 0; } /* * Link a key into to a keyring. * * Must be called with __key_link_begin() having being called. Discards any * already extant link to matching key if there is one, so that each keyring * holds at most one link to any given key of a particular type+description * combination. */ void __key_link(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* * Finish linking a key into to a keyring. * * Must be called with __key_link_begin() having being called. */ void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit) __releases(&keyring->sem) __releases(&keyring_serialise_link_lock) { BUG_ON(index_key->type == NULL); kenter("%d,%s,", keyring->serial, index_key->type->name); if (edit) { if (!edit->dead_leaf) { key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } assoc_array_cancel_edit(edit); } up_write(&keyring->sem); if (index_key->type == &key_type_keyring) mutex_unlock(&keyring_serialise_link_lock); } /* * Check addition of keys to restricted keyrings. */ static int __key_link_check_restriction(struct key *keyring, struct key *key) { if (!keyring->restrict_link || !keyring->restrict_link->check) return 0; return keyring->restrict_link->check(keyring, key->type, &key->payload, keyring->restrict_link->key); } /** * key_link - Link a key to a keyring * @keyring: The keyring to make the link in. * @key: The key to link to. * * Make a link in a keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring. * * This function will write-lock the keyring's semaphore and will consume some * of the user's key data quota to hold the link. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is * full, -EDQUOT if there is insufficient key data quota remaining to add * another link or -ENOMEM if there's insufficient memory. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_link(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage)); key_check(keyring); key_check(key); ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_end; kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage)); ret = __key_link_check_restriction(keyring, key); if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); error: kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); /* * Lock a keyring for unlink. */ static int __key_unlink_lock(struct key *keyring) __acquires(&keyring->sem) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); return 0; } /* * Begin the process of unlinking a key from a keyring. */ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) return PTR_ERR(edit); if (!edit) return -ENOENT; *_edit = edit; return 0; } /* * Apply an unlink change. */ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } /* * Finish unlinking a key from to a keyring. */ static void __key_unlink_end(struct key *keyring, struct key *key, struct assoc_array_edit *edit) __releases(&keyring->sem) { if (edit) assoc_array_cancel_edit(edit); up_write(&keyring->sem); } /** * key_unlink - Unlink the first link to a key from a keyring. * @keyring: The keyring to remove the link from. * @key: The key the link is to. * * Remove a link from a keyring to a key. * * This function will write-lock the keyring's semaphore. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if * the key isn't linked to by the keyring or -ENOMEM if there's insufficient * memory. * * It is assumed that the caller has checked that it is permitted for a link to * be removed (the keyring should have Write permission; no permissions are * required on the key). */ int key_unlink(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; key_check(keyring); key_check(key); ret = __key_unlink_lock(keyring); if (ret < 0) return ret; ret = __key_unlink_begin(keyring, key, &edit); if (ret == 0) __key_unlink(keyring, key, &edit); __key_unlink_end(keyring, key, edit); return ret; } EXPORT_SYMBOL(key_unlink); /** * key_move - Move a key from one keyring to another * @key: The key to move * @from_keyring: The keyring to remove the link from. * @to_keyring: The keyring to make the link in. * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL. * * Make a link in @to_keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring * whilst simultaneously removing a link to the key from @from_keyring. * * This function will write-lock both keyring's semaphores and will consume * some of the user's key data quota to hold the link on @to_keyring. * * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring, * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second * keyring is full, -EDQUOT if there is insufficient key data quota remaining * to add another link or -ENOMEM if there's insufficient memory. If * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a * matching key in @to_keyring. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags) { struct assoc_array_edit *from_edit = NULL, *to_edit = NULL; int ret; kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial); if (from_keyring == to_keyring) return 0; key_check(key); key_check(from_keyring); key_check(to_keyring); ret = __key_move_lock(from_keyring, to_keyring, &key->index_key); if (ret < 0) goto out; ret = __key_unlink_begin(from_keyring, key, &from_edit); if (ret < 0) goto error; ret = __key_link_begin(to_keyring, &key->index_key, &to_edit); if (ret < 0) goto error; ret = -EEXIST; if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL)) goto error; ret = __key_link_check_restriction(to_keyring, key); if (ret < 0) goto error; ret = __key_link_check_live_key(to_keyring, key); if (ret < 0) goto error; __key_unlink(from_keyring, key, &from_edit); __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); out: kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(key_move); /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. * * Clear the contents of the specified keyring. * * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. */ int keyring_clear(struct key *keyring) { struct assoc_array_edit *edit; int ret; if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (IS_ERR(edit)) { ret = PTR_ERR(edit); } else { if (edit) assoc_array_apply_edit(edit); notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } up_write(&keyring->sem); return ret; } EXPORT_SYMBOL(keyring_clear); /* * Dispose of the links from a revoked keyring. * * This is called with the key sem write-locked. */ static void keyring_revoke(struct key *keyring) { struct assoc_array_edit *edit; edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (!IS_ERR(edit)) { if (edit) assoc_array_apply_edit(edit); key_payload_reserve(keyring, 0); } } static bool keyring_gc_select_iterator(void *object, void *iterator_data) { struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; if (key_is_dead(key, *limit)) return false; key_get(key); return true; } static int keyring_gc_check_iterator(const void *object, void *iterator_data) { const struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; key_check(key); return key_is_dead(key, *limit); } /* * Garbage collect pointers from a keyring. * * Not called with any locks held. The keyring's key struct will not be * deallocated under us as only our caller may deallocate it. */ void keyring_gc(struct key *keyring, time64_t limit) { int result; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto dont_gc; /* scan the keyring looking for dead keys */ rcu_read_lock(); result = assoc_array_iterate(&keyring->keys, keyring_gc_check_iterator, &limit); rcu_read_unlock(); if (result == true) goto do_gc; dont_gc: kleave(" [no gc]"); return; do_gc: down_write(&keyring->sem); assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops, keyring_gc_select_iterator, &limit); up_write(&keyring->sem); kleave(" [gc]"); } /* * Garbage collect restriction pointers from a keyring. * * Keyring restrictions are associated with a key type, and must be cleaned * up if the key type is unregistered. The restriction is altered to always * reject additional keys so a keyring cannot be opened up by unregistering * a key type. * * Not called with any keyring locks held. The keyring's key struct will not * be deallocated under us as only our caller may deallocate it. * * The caller is required to hold key_types_sem and dead_type->sem. This is * fulfilled by key_gc_keytype() holding the locks on behalf of * key_garbage_collector(), which it invokes on a workqueue. */ void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type) { struct key_restriction *keyres; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); /* * keyring->restrict_link is only assigned at key allocation time * or with the key type locked, so the only values that could be * concurrently assigned to keyring->restrict_link are for key * types other than dead_type. Given this, it's ok to check * the key type before acquiring keyring->sem. */ if (!dead_type || !keyring->restrict_link || keyring->restrict_link->keytype != dead_type) { kleave(" [no restriction gc]"); return; } /* Lock the keyring to ensure that a link is not in progress */ down_write(&keyring->sem); keyres = keyring->restrict_link; keyres->check = restrict_link_reject; key_put(keyres->key); keyres->key = NULL; keyres->keytype = NULL; up_write(&keyring->sem); kleave(" [restriction gc]"); }
338 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H #include <linux/sched/coredump.h> #include <linux/mm_types.h> #include <linux/fs.h> /* only for vma_is_dax() */ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); void huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pmd - insert a pmd size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pud - insert a pud size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pud size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_NEVER_DAX, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, }; struct kobject; struct kobj_attribute; ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) #define HPAGE_PUD_SHIFT PUD_SHIFT #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) extern unsigned long transparent_hugepage_flags; static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR)) return false; } if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { /* Explicitly disabled through madvise. */ if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; return true; } /* * to be used on vmas which are known to support THP. * Use transparent_hugepage_active otherwise */ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { /* * If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) return false; if (!transhuge_vma_enabled(vma, vma->vm_flags)) return false; if (vma_is_temporary_stack(vma)) return false; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; if (vma_is_dax(vma)) return true; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) return !!(vma->vm_flags & VM_HUGEPAGE); return false; } bool transparent_hugepage_active(struct vm_area_struct *vma); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void prep_transhuge_page(struct page *page); void free_transhuge_page(struct page *page); bool is_transparent_hugepage(struct page *page); bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ } while (0) void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ if (pud_trans_huge(*____pud) \ || pud_devmap(*____pud)) \ __split_huge_pud(__vma, __pud, __address); \ } while (0) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); static inline int is_swap_pmd(pmd_t pmd) { return !pmd_none(pmd) && !pmd_present(pmd); } /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { if (pud_trans_huge(*pud) || pud_devmap(*pud)) return __pud_trans_huge_lock(pud, vma); else return NULL; } /** * thp_head - Head page of a transparent huge page. * @page: Any page (tail, head or regular) found in the page cache. */ static inline struct page *thp_head(struct page *page) { return compound_head(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_ORDER; return 0; } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_NR; return 1; } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap); vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); extern struct page *huge_zero_page; extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_page(struct page *page) { return READ_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); } static inline bool is_huge_zero_pud(pud_t pud) { return false; } struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } static inline struct list_head *page_deferred_list(struct page *page) { /* * Global or memcg deferred list in the second tail pages is * occupied by compound_head. */ return &page[2].deferred_list; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) static inline struct page *thp_head(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return page; } static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 0; } static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 1; } static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { return false; } static inline bool transparent_hugepage_active(struct vm_area_struct *vma) { return false; } static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { return false; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { return false; } static inline void prep_transhuge_page(struct page *page) {} static inline bool is_transparent_hugepage(struct page *page) { return false; } #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL static inline bool can_split_huge_page(struct page *page, int *pextra_pins) { BUILD_BUG(); return false; } static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { return 0; } static inline int split_huge_page(struct page *page) { return 0; } static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { BUG(); return 0; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { } static inline int is_swap_pmd(pmd_t pmd) { return 0; } static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { return NULL; } static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { return 0; } static inline bool is_huge_zero_page(struct page *page) { return false; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return false; } static inline bool is_huge_zero_pud(pud_t pud) { return false; } static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline bool thp_migration_supported(void) { return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #endif /* _LINUX_HUGE_MM_H */
4 7 6 7 8 1 6 1 1 1 5 5 561 561 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* af_can.c - Protocol family CAN core module * (used by different CAN protocol modules) * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/stddef.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/uaccess.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/can-ml.h> #include <linux/ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include "af_can.h" MODULE_DESCRIPTION("Controller Area Network PF_CAN core"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, " "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS_NETPROTO(PF_CAN); static int stats_timer __read_mostly = 1; module_param(stats_timer, int, 0444); MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); static struct kmem_cache *rcv_cache __read_mostly; /* table of registered CAN protocols */ static const struct can_proto __rcu *proto_tab[CAN_NPROTO] __read_mostly; static DEFINE_MUTEX(proto_tab_lock); static atomic_t skbcounter = ATOMIC_INIT(0); /* af_can socket functions */ void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); } EXPORT_SYMBOL(can_sock_destruct); static const struct can_proto *can_get_proto(int protocol) { const struct can_proto *cp; rcu_read_lock(); cp = rcu_dereference(proto_tab[protocol]); if (cp && !try_module_get(cp->prot->owner)) cp = NULL; rcu_read_unlock(); return cp; } static inline void can_put_proto(const struct can_proto *cp) { module_put(cp->prot->owner); } static int can_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; const struct can_proto *cp; int err = 0; sock->state = SS_UNCONNECTED; if (protocol < 0 || protocol >= CAN_NPROTO) return -EINVAL; cp = can_get_proto(protocol); #ifdef CONFIG_MODULES if (!cp) { /* try to load protocol module if kernel is modular */ err = request_module("can-proto-%d", protocol); /* In case of error we only print a message but don't * return the error code immediately. Below we will * return -EPROTONOSUPPORT */ if (err) pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n", protocol); cp = can_get_proto(protocol); } #endif /* check for available protocol and correct usage */ if (!cp) return -EPROTONOSUPPORT; if (cp->type != sock->type) { err = -EPROTOTYPE; goto errout; } sock->ops = cp->ops; sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; } sock_init_data(sock, sk); sk->sk_destruct = can_sock_destruct; if (sk->sk_prot->init) err = sk->sk_prot->init(sk); if (err) { /* release sk on errors */ sock_orphan(sk); sock_put(sk); } errout: can_put_proto(cp); return err; } /* af_can tx path */ /** * can_send - transmit a CAN frame (optional with local loopback) * @skb: pointer to socket buffer with CAN frame in data section * @loop: loopback for listeners on local CAN sockets (recommended default!) * * Due to the loopback this routine must not be called from hardirq context. * * Return: * 0 on success * -ENETDOWN when the selected interface is down * -ENOBUFS on full driver queue (see net_xmit_errno()) * -ENOMEM when local loopback failed at calling skb_clone() * -EPERM when trying to send on a non-CAN interface * -EMSGSIZE CAN frame size is bigger than CAN interface MTU * -EINVAL when the skb->data does not contain a valid CAN frame */ int can_send(struct sk_buff *skb, int loop) { struct sk_buff *newskb = NULL; struct canfd_frame *cfd = (struct canfd_frame *)skb->data; struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats; int err = -EINVAL; if (skb->len == CAN_MTU) { skb->protocol = htons(ETH_P_CAN); if (unlikely(cfd->len > CAN_MAX_DLEN)) goto inval_skb; } else if (skb->len == CANFD_MTU) { skb->protocol = htons(ETH_P_CANFD); if (unlikely(cfd->len > CANFD_MAX_DLEN)) goto inval_skb; } else { goto inval_skb; } /* Make sure the CAN frame can pass the selected CAN netdevice. * As structs can_frame and canfd_frame are similar, we can provide * CAN FD frames to legacy CAN drivers as long as the length is <= 8 */ if (unlikely(skb->len > skb->dev->mtu && cfd->len > CAN_MAX_DLEN)) { err = -EMSGSIZE; goto inval_skb; } if (unlikely(skb->dev->type != ARPHRD_CAN)) { err = -EPERM; goto inval_skb; } if (unlikely(!(skb->dev->flags & IFF_UP))) { err = -ENETDOWN; goto inval_skb; } skb->ip_summed = CHECKSUM_UNNECESSARY; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); if (loop) { /* local loopback of sent CAN frames */ /* indication for the CAN driver: do loopback */ skb->pkt_type = PACKET_LOOPBACK; /* The reference to the originating sock may be required * by the receiving socket to check whether the frame is * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS * Therefore we have to ensure that skb->sk remains the * reference to the originating sock by restoring skb->sk * after each skb_clone() or skb_orphan() usage. */ if (!(skb->dev->flags & IFF_ECHO)) { /* If the interface is not capable to do loopback * itself, we do it here. */ newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) { kfree_skb(skb); return -ENOMEM; } can_skb_set_owner(newskb, skb->sk); newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; } } else { /* indication for the CAN driver: no loopback required */ skb->pkt_type = PACKET_HOST; } /* send to netdevice */ err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); if (err) { kfree_skb(newskb); return err; } if (newskb) netif_rx_ni(newskb); /* update statistics */ pkg_stats->tx_frames++; pkg_stats->tx_frames_delta++; return 0; inval_skb: kfree_skb(skb); return err; } EXPORT_SYMBOL(can_send); /* af_can rx path */ static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net, struct net_device *dev) { if (dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); return &can_ml->dev_rcv_lists; } else { return net->can.rx_alldev_list; } } /** * effhash - hash function for 29 bit CAN identifier reduction * @can_id: 29 bit CAN identifier * * Description: * To reduce the linear traversal in one linked list of _single_ EFF CAN * frame subscriptions the 29 bit identifier is mapped to 10 bits. * (see CAN_EFF_RCV_HASH_BITS definition) * * Return: * Hash value from 0x000 - 0x3FF ( enforced by CAN_EFF_RCV_HASH_BITS mask ) */ static unsigned int effhash(canid_t can_id) { unsigned int hash; hash = can_id; hash ^= can_id >> CAN_EFF_RCV_HASH_BITS; hash ^= can_id >> (2 * CAN_EFF_RCV_HASH_BITS); return hash & ((1 << CAN_EFF_RCV_HASH_BITS) - 1); } /** * can_rcv_list_find - determine optimal filterlist inside device filter struct * @can_id: pointer to CAN identifier of a given can_filter * @mask: pointer to CAN mask of a given can_filter * @dev_rcv_lists: pointer to the device filter struct * * Description: * Returns the optimal filterlist to reduce the filter handling in the * receive path. This function is called by service functions that need * to register or unregister a can_filter in the filter lists. * * A filter matches in general, when * * <received_can_id> & mask == can_id & mask * * so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describe * relevant bits for the filter. * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error messages (CAN_ERR_FLAG bit set in mask). For error msg * frames there is a special filterlist and a special rx path filter handling. * * Return: * Pointer to optimal filterlist for the given can_id/mask pair. * Consistency checked mask. * Reduced can_id to have a preprocessed filter compare value. */ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask, struct can_dev_rcv_lists *dev_rcv_lists) { canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */ /* filter for error message frames in extra filterlist */ if (*mask & CAN_ERR_FLAG) { /* clear CAN_ERR_FLAG in filter entry */ *mask &= CAN_ERR_MASK; return &dev_rcv_lists->rx[RX_ERR]; } /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */ #define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG) /* ensure valid values in can_mask for 'SFF only' frame filtering */ if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG)) *mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS); /* reduce condition testing at receive time */ *can_id &= *mask; /* inverse can_id/can_mask filter */ if (inv) return &dev_rcv_lists->rx[RX_INV]; /* mask == 0 => no condition testing at receive time */ if (!(*mask)) return &dev_rcv_lists->rx[RX_ALL]; /* extra filterlists for the subscription of a single non-RTR can_id */ if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) && !(*can_id & CAN_RTR_FLAG)) { if (*can_id & CAN_EFF_FLAG) { if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_eff[effhash(*can_id)]; } else { if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_sff[*can_id]; } } /* default: filter via can_id/can_mask */ return &dev_rcv_lists->rx[RX_FIL]; } /** * can_rx_register - subscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list) * @can_id: CAN identifier (see description) * @mask: CAN mask (see description) * @func: callback function on filter match * @data: returned parameter for callback function * @ident: string for calling module identification * @sk: socket pointer (might be NULL) * * Description: * Invokes the callback function with the received sk_buff and the given * parameter 'data' on a matching receive filter. A filter matches, when * * <received_can_id> & mask == can_id & mask * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error message frames (CAN_ERR_FLAG bit set in mask). * * The provided pointer to the sk_buff is guaranteed to be valid as long as * the callback function is running. The callback function must *not* free * the given sk_buff while processing it's task. When the given sk_buff is * needed after the end of the callback function it must be cloned inside * the callback function with skb_clone(). * * Return: * 0 on success * -ENOMEM on missing cache mem to create subscription entry * -ENODEV unknown device */ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, char *ident, struct sock *sk) { struct receiver *rcv; struct hlist_head *rcv_list; struct can_dev_rcv_lists *dev_rcv_lists; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; int err = 0; /* insert new receiver (dev,canid,mask) -> (func,data) */ if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev))) return -ENODEV; if (dev && !net_eq(net, dev_net(dev))) return -ENODEV; rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL); if (!rcv) return -ENOMEM; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); rcv->can_id = can_id; rcv->mask = mask; rcv->matches = 0; rcv->func = func; rcv->data = data; rcv->ident = ident; rcv->sk = sk; hlist_add_head_rcu(&rcv->list, rcv_list); dev_rcv_lists->entries++; rcv_lists_stats->rcv_entries++; rcv_lists_stats->rcv_entries_max = max(rcv_lists_stats->rcv_entries_max, rcv_lists_stats->rcv_entries); spin_unlock_bh(&net->can.rcvlists_lock); return err; } EXPORT_SYMBOL(can_rx_register); /* can_rx_delete_receiver - rcu callback for single receiver entry removal */ static void can_rx_delete_receiver(struct rcu_head *rp) { struct receiver *rcv = container_of(rp, struct receiver, rcu); struct sock *sk = rcv->sk; kmem_cache_free(rcv_cache, rcv); if (sk) sock_put(sk); } /** * can_rx_unregister - unsubscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list) * @can_id: CAN identifier * @mask: CAN mask * @func: callback function on filter match * @data: returned parameter for callback function * * Description: * Removes subscription entry depending on given (subscription) values. */ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data) { struct receiver *rcv = NULL; struct hlist_head *rcv_list; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; struct can_dev_rcv_lists *dev_rcv_lists; if (dev && dev->type != ARPHRD_CAN) return; if (dev && !net_eq(net, dev_net(dev))) return; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); /* Search the receiver list for the item to delete. This should * exist, since no receiver may be unregistered that hasn't * been registered before. */ hlist_for_each_entry_rcu(rcv, rcv_list, list) { if (rcv->can_id == can_id && rcv->mask == mask && rcv->func == func && rcv->data == data) break; } /* Check for bugs in CAN protocol implementations using af_can.c: * 'rcv' will be NULL if no matching list item was found for removal. * As this case may potentially happen when closing a socket while * the notifier for removing the CAN netdev is running we just print * a warning here. */ if (!rcv) { pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n", DNAME(dev), can_id, mask); goto out; } hlist_del_rcu(&rcv->list); dev_rcv_lists->entries--; if (rcv_lists_stats->rcv_entries > 0) rcv_lists_stats->rcv_entries--; out: spin_unlock_bh(&net->can.rcvlists_lock); /* schedule the receiver item for deletion */ if (rcv) { if (rcv->sk) sock_hold(rcv->sk); call_rcu(&rcv->rcu, can_rx_delete_receiver); } } EXPORT_SYMBOL(can_rx_unregister); static inline void deliver(struct sk_buff *skb, struct receiver *rcv) { rcv->func(skb, rcv->data); rcv->matches++; } static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb) { struct receiver *rcv; int matches = 0; struct can_frame *cf = (struct can_frame *)skb->data; canid_t can_id = cf->can_id; if (dev_rcv_lists->entries == 0) return 0; if (can_id & CAN_ERR_FLAG) { /* check for error message frame entries only */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) { if (can_id & rcv->mask) { deliver(skb, rcv); matches++; } } return matches; } /* check for unfiltered entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) { deliver(skb, rcv); matches++; } /* check for can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) { if ((can_id & rcv->mask) == rcv->can_id) { deliver(skb, rcv); matches++; } } /* check for inverted can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) { if ((can_id & rcv->mask) != rcv->can_id) { deliver(skb, rcv); matches++; } } /* check filterlists for single non-RTR can_ids */ if (can_id & CAN_RTR_FLAG) return matches; if (can_id & CAN_EFF_FLAG) { hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) { if (rcv->can_id == can_id) { deliver(skb, rcv); matches++; } } } else { can_id &= CAN_SFF_MASK; hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) { deliver(skb, rcv); matches++; } } return matches; } static void can_receive(struct sk_buff *skb, struct net_device *dev) { struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = dev_net(dev); struct can_pkg_stats *pkg_stats = net->can.pkg_stats; int matches; /* update statistics */ pkg_stats->rx_frames++; pkg_stats->rx_frames_delta++; /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter); rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ matches = can_rcv_filter(net->can.rx_alldev_list, skb); /* find receive list for this device */ dev_rcv_lists = can_dev_rcv_lists_find(net, dev); matches += can_rcv_filter(dev_rcv_lists, skb); rcu_read_unlock(); /* consume the skbuff allocated by the netdevice driver */ consume_skb(skb); if (matches > 0) { pkg_stats->matches++; pkg_stats->matches_delta++; } } static int can_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || skb->len != CAN_MTU)) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); goto free_skb; } /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. */ if (unlikely(cfd->len > CAN_MAX_DLEN)) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d, datalen %d\n", dev->type, skb->len, cfd->len); goto free_skb; } can_receive(skb, dev); return NET_RX_SUCCESS; free_skb: kfree_skb(skb); return NET_RX_DROP; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || skb->len != CANFD_MTU)) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); goto free_skb; } /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. */ if (unlikely(cfd->len > CANFD_MAX_DLEN)) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d, datalen %d\n", dev->type, skb->len, cfd->len); goto free_skb; } can_receive(skb, dev); return NET_RX_SUCCESS; free_skb: kfree_skb(skb); return NET_RX_DROP; } /* af_can protocol functions */ /** * can_proto_register - register CAN transport protocol * @cp: pointer to CAN protocol structure * * Return: * 0 on success * -EINVAL invalid (out of range) protocol number * -EBUSY protocol already in use * -ENOBUF if proto_register() fails */ int can_proto_register(const struct can_proto *cp) { int proto = cp->protocol; int err = 0; if (proto < 0 || proto >= CAN_NPROTO) { pr_err("can: protocol number %d out of range\n", proto); return -EINVAL; } err = proto_register(cp->prot, 0); if (err < 0) return err; mutex_lock(&proto_tab_lock); if (rcu_access_pointer(proto_tab[proto])) { pr_err("can: protocol %d already registered\n", proto); err = -EBUSY; } else { RCU_INIT_POINTER(proto_tab[proto], cp); } mutex_unlock(&proto_tab_lock); if (err < 0) proto_unregister(cp->prot); return err; } EXPORT_SYMBOL(can_proto_register); /** * can_proto_unregister - unregister CAN transport protocol * @cp: pointer to CAN protocol structure */ void can_proto_unregister(const struct can_proto *cp) { int proto = cp->protocol; mutex_lock(&proto_tab_lock); BUG_ON(rcu_access_pointer(proto_tab[proto]) != cp); RCU_INIT_POINTER(proto_tab[proto], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); proto_unregister(cp->prot); } EXPORT_SYMBOL(can_proto_unregister); static int can_pernet_init(struct net *net) { spin_lock_init(&net->can.rcvlists_lock); net->can.rx_alldev_list = kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL); if (!net->can.rx_alldev_list) goto out; net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL); if (!net->can.pkg_stats) goto out_free_rx_alldev_list; net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL); if (!net->can.rcv_lists_stats) goto out_free_pkg_stats; if (IS_ENABLED(CONFIG_PROC_FS)) { /* the statistics are updated every second (timer triggered) */ if (stats_timer) { timer_setup(&net->can.stattimer, can_stat_update, 0); mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); } net->can.pkg_stats->jiffies_init = jiffies; can_init_proc(net); } return 0; out_free_pkg_stats: kfree(net->can.pkg_stats); out_free_rx_alldev_list: kfree(net->can.rx_alldev_list); out: return -ENOMEM; } static void can_pernet_exit(struct net *net) { if (IS_ENABLED(CONFIG_PROC_FS)) { can_remove_proc(net); if (stats_timer) del_timer_sync(&net->can.stattimer); } kfree(net->can.rx_alldev_list); kfree(net->can.pkg_stats); kfree(net->can.rcv_lists_stats); } /* af_can module init/exit functions */ static struct packet_type can_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CAN), .func = can_rcv, }; static struct packet_type canfd_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CANFD), .func = canfd_rcv, }; static const struct net_proto_family can_family_ops = { .family = PF_CAN, .create = can_create, .owner = THIS_MODULE, }; static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, }; static __init int can_init(void) { int err; /* check for correct padding to be able to use the structs similarly */ BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); pr_info("can: controller area network core\n"); rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver), 0, 0, NULL); if (!rcv_cache) return -ENOMEM; err = register_pernet_subsys(&can_pernet_ops); if (err) goto out_pernet; /* protocol register */ err = sock_register(&can_family_ops); if (err) goto out_sock; dev_add_pack(&can_packet); dev_add_pack(&canfd_packet); return 0; out_sock: unregister_pernet_subsys(&can_pernet_ops); out_pernet: kmem_cache_destroy(rcv_cache); return err; } static __exit void can_exit(void) { /* protocol unregister */ dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); sock_unregister(PF_CAN); unregister_pernet_subsys(&can_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(rcv_cache); } module_init(can_init); module_exit(can_exit);
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include <linux/export.h> #include <linux/etherdevice.h> #include <net/mac80211.h> #include <asm/unaligned.h> #include "ieee80211_i.h" #include "rate.h" #include "mesh.h" #include "led.h" #include "wme.h" void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tmp; skb->pkt_type = IEEE80211_TX_STATUS_MSG; skb_queue_tail(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS ? &local->skb_queue : &local->skb_queue_unreliable, skb); tmp = skb_queue_len(&local->skb_queue) + skb_queue_len(&local->skb_queue_unreliable); while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT && (skb = skb_dequeue(&local->skb_queue_unreliable))) { ieee80211_free_txskb(hw, skb); tmp--; I802_DEBUG_INC(local->tx_status_drop); } tasklet_schedule(&local->tasklet); } EXPORT_SYMBOL(ieee80211_tx_status_irqsafe); static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; int ac; if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_CTL_AMPDU | IEEE80211_TX_CTL_HW_80211_ENCAP)) { ieee80211_free_txskb(&local->hw, skb); return; } /* * This skb 'survived' a round-trip through the driver, and * hopefully the driver didn't mangle it too badly. However, * we can definitely not rely on the control information * being correct. Clear it so we don't get junk there, and * indicate that it needs new processing, but must not be * modified/encrypted again. */ memset(&info->control, 0, sizeof(info->control)); info->control.jiffies = jiffies; info->control.vif = &sta->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; sta->status_stats.filtered++; /* * Clear more-data bit on filtered frames, it might be set * but later frames might time out so it might have to be * clear again ... It's all rather unlikely (this frame * should time out first, right?) but let's not confuse * peers unnecessarily. */ if (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_MOREDATA)) hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control)) { u8 *p = ieee80211_get_qos_ctl(hdr); int tid = *p & IEEE80211_QOS_CTL_TID_MASK; /* * Clear EOSP if set, this could happen e.g. * if an absence period (us being a P2P GO) * shortens the SP. */ if (*p & IEEE80211_QOS_CTL_EOSP) *p &= ~IEEE80211_QOS_CTL_EOSP; ac = ieee80211_ac_from_tid(tid); } else { ac = IEEE80211_AC_BE; } /* * Clear the TX filter mask for this STA when sending the next * packet. If the STA went to power save mode, this will happen * when it wakes up for the next time. */ set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT); ieee80211_clear_fast_xmit(sta); /* * This code races in the following way: * * (1) STA sends frame indicating it will go to sleep and does so * (2) hardware/firmware adds STA to filter list, passes frame up * (3) hardware/firmware processes TX fifo and suppresses a frame * (4) we get TX status before having processed the frame and * knowing that the STA has gone to sleep. * * This is actually quite unlikely even when both those events are * processed from interrupts coming in quickly after one another or * even at the same time because we queue both TX status events and * RX frames to be processed by a tasklet and process them in the * same order that they were received or TX status last. Hence, there * is no race as long as the frame RX is processed before the next TX * status, which drivers can ensure, see below. * * Note that this can only happen if the hardware or firmware can * actually add STAs to the filter list, if this is done by the * driver in response to set_tim() (which will only reduce the race * this whole filtering tries to solve, not completely solve it) * this situation cannot happen. * * To completely solve this race drivers need to make sure that they * (a) don't mix the irq-safe/not irq-safe TX status/RX processing * functions and * (b) always process RX events before TX status events if ordering * can be unknown, for example with different interrupt status * bits. * (c) if PS mode transitions are manual (i.e. the flag * %IEEE80211_HW_AP_LINK_PS is set), always process PS state * changes before calling TX status events if ordering can be * unknown. */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && skb_queue_len(&sta->tx_filtered[ac]) < STA_MAX_TX_BUFFER) { skb_queue_tail(&sta->tx_filtered[ac], skb); sta_info_recalc_tim(sta); if (!timer_pending(&local->sta_cleanup)) mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); return; } if (!test_sta_flag(sta, WLAN_STA_PS_STA) && !(info->flags & IEEE80211_TX_INTFL_RETRIED)) { /* Software retry the packet once */ info->flags |= IEEE80211_TX_INTFL_RETRIED; ieee80211_add_pending_skb(local, skb); return; } ps_dbg_ratelimited(sta->sdata, "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", skb_queue_len(&sta->tx_filtered[ac]), !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies); ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx || !tid_tx->bar_pending) return; tid_tx->bar_pending = false; ieee80211_send_bar(&sta->sdata->vif, addr, tid, tid_tx->failed_bar_ssn); } static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *) skb->data; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qc = ieee80211_get_qos_ctl(hdr); u16 tid = qc[0] & 0xf; ieee80211_check_pending_bar(sta, hdr->addr1, tid); } if (ieee80211_is_action(mgmt->frame_control) && !ieee80211_has_protected(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_HT && mgmt->u.action.u.ht_smps.action == WLAN_HT_ACTION_SMPS && ieee80211_sdata_running(sdata)) { enum ieee80211_smps_mode smps_mode; switch (mgmt->u.action.u.ht_smps.smps_control) { case WLAN_HT_SMPS_CONTROL_DYNAMIC: smps_mode = IEEE80211_SMPS_DYNAMIC; break; case WLAN_HT_SMPS_CONTROL_STATIC: smps_mode = IEEE80211_SMPS_STATIC; break; case WLAN_HT_SMPS_CONTROL_DISABLED: default: /* shouldn't happen since we don't send that */ smps_mode = IEEE80211_SMPS_OFF; break; } if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* * This update looks racy, but isn't -- if we come * here we've definitely got a station that we're * talking to, and on a managed interface that can * only be the AP. And the only other place updating * this variable in managed mode is before association. */ sdata->smps_mode = smps_mode; ieee80211_queue_work(&local->hw, &sdata->recalc_smps); } else if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { sta->known_smps_mode = smps_mode; } } } static void ieee80211_set_bar_pending(struct sta_info *sta, u8 tid, u16 ssn) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx) return; tid_tx->failed_bar_ssn = ssn; tid_tx->bar_pending = true; } static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info, struct ieee80211_tx_status *status) { int len = sizeof(struct ieee80211_radiotap_header); /* IEEE80211_RADIOTAP_RATE rate */ if (status && status->rate && !(status->rate->flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) len += 2; else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) len += 2; /* IEEE80211_RADIOTAP_TX_FLAGS */ len += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ len += 1; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (status && status->rate) { if (status->rate->flags & RATE_INFO_FLAGS_MCS) len += 3; else if (status->rate->flags & RATE_INFO_FLAGS_VHT_MCS) len = ALIGN(len, 2) + 12; else if (status->rate->flags & RATE_INFO_FLAGS_HE_MCS) len = ALIGN(len, 2) + 12; } else if (info->status.rates[0].idx >= 0) { if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) len += 3; else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) len = ALIGN(len, 2) + 12; } return len; } static void ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct sk_buff *skb, int retry_count, int rtap_len, int shift, struct ieee80211_tx_status *status) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_radiotap_header *rthdr; unsigned char *pos; u16 legacy_rate = 0; u16 txflags; rthdr = skb_push(skb, rtap_len); memset(rthdr, 0, rtap_len); rthdr->it_len = cpu_to_le16(rtap_len); rthdr->it_present = cpu_to_le32(BIT(IEEE80211_RADIOTAP_TX_FLAGS) | BIT(IEEE80211_RADIOTAP_DATA_RETRIES)); pos = (unsigned char *)(rthdr + 1); /* * XXX: Once radiotap gets the bitmap reset thing the vendor * extensions proposal contains, we can actually report * the whole set of tries we did. */ /* IEEE80211_RADIOTAP_RATE */ if (status && status->rate) { if (!(status->rate->flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) legacy_rate = status->rate->legacy; } else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) legacy_rate = sband->bitrates[info->status.rates[0].idx].bitrate; if (legacy_rate) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); *pos = DIV_ROUND_UP(legacy_rate, 5 * (1 << shift)); /* padding for tx flags */ pos += 2; } /* IEEE80211_RADIOTAP_TX_FLAGS */ txflags = 0; if (!(info->flags & IEEE80211_TX_STAT_ACK) && !is_multicast_ether_addr(hdr->addr1)) txflags |= IEEE80211_RADIOTAP_F_TX_FAIL; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) txflags |= IEEE80211_RADIOTAP_F_TX_CTS; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) txflags |= IEEE80211_RADIOTAP_F_TX_RTS; put_unaligned_le16(txflags, pos); pos += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ /* for now report the total retry_count */ *pos = retry_count; pos++; if (status && status->rate && (status->rate->flags & RATE_INFO_FLAGS_MCS)) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (status->rate->flags & RATE_INFO_FLAGS_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (status->rate->bw == RATE_INFO_BW_40) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; pos[2] = status->rate->mcs; pos += 3; } else if (status && status->rate && (status->rate->flags & RATE_INFO_FLAGS_VHT_MCS)) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (status->rate->flags & RATE_INFO_FLAGS_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ switch (status->rate->bw) { case RATE_INFO_BW_160: *pos = 11; break; case RATE_INFO_BW_80: *pos = 4; break; case RATE_INFO_BW_40: *pos = 1; break; default: *pos = 0; break; } pos++; /* u8 mcs_nss[4] */ *pos = (status->rate->mcs << 4) | status->rate->nss; pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } else if (status && status->rate && (status->rate->flags & RATE_INFO_FLAGS_HE_MCS)) { struct ieee80211_radiotap_he *he; rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); he = (struct ieee80211_radiotap_he *)pos; he->data1 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA1_FORMAT_SU | IEEE80211_RADIOTAP_HE_DATA1_DATA_MCS_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_DATA_DCM_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_BW_RU_ALLOC_KNOWN); he->data2 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA2_GI_KNOWN); #define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f) he->data6 |= HE_PREP(DATA6_NSTS, status->rate->nss); #define CHECK_GI(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ (int)NL80211_RATE_INFO_HE_GI_##s) CHECK_GI(0_8); CHECK_GI(1_6); CHECK_GI(3_2); he->data3 |= HE_PREP(DATA3_DATA_MCS, status->rate->mcs); he->data3 |= HE_PREP(DATA3_DATA_DCM, status->rate->he_dcm); he->data5 |= HE_PREP(DATA5_GI, status->rate->he_gi); switch (status->rate->bw) { case RATE_INFO_BW_20: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); break; case RATE_INFO_BW_40: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); break; case RATE_INFO_BW_80: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); break; case RATE_INFO_BW_160: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); break; case RATE_INFO_BW_HE_RU: #define CHECK_RU_ALLOC(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) CHECK_RU_ALLOC(26); CHECK_RU_ALLOC(52); CHECK_RU_ALLOC(106); CHECK_RU_ALLOC(242); CHECK_RU_ALLOC(484); CHECK_RU_ALLOC(996); CHECK_RU_ALLOC(2x996); he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, status->rate->he_ru_alloc + 4); break; default: WARN_ONCE(1, "Invalid SU BW %d\n", status->rate->bw); } pos += sizeof(struct ieee80211_radiotap_he); } if ((status && status->rate) || info->status.rates[0].idx < 0) return; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; if (info->status.rates[0].flags & IEEE80211_TX_RC_GREEN_FIELD) pos[1] |= IEEE80211_RADIOTAP_MCS_FMT_GF; pos[2] = info->status.rates[0].idx; pos += 3; } else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) *pos = 1; else if (info->status.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH) *pos = 4; else if (info->status.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH) *pos = 11; else /* IEEE80211_TX_RC_{20_MHZ_WIDTH,FIXME:DUP_DATA} */ *pos = 0; pos++; /* u8 mcs_nss[4] */ *pos = (ieee80211_rate_get_vht_mcs(&info->status.rates[0]) << 4) | ieee80211_rate_get_vht_nss(&info->status.rates[0]); pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } } /* * Handles the tx for TDLS teardown frames. * If the frame wasn't ACKed by the peer - it will be re-sent through the AP */ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 flags) { struct sk_buff *teardown_skb; struct sk_buff *orig_teardown_skb; bool is_teardown = false; /* Get the teardown data we need and free the lock */ spin_lock(&sdata->u.mgd.teardown_lock); teardown_skb = sdata->u.mgd.teardown_skb; orig_teardown_skb = sdata->u.mgd.orig_teardown_skb; if ((skb == orig_teardown_skb) && teardown_skb) { sdata->u.mgd.teardown_skb = NULL; sdata->u.mgd.orig_teardown_skb = NULL; is_teardown = true; } spin_unlock(&sdata->u.mgd.teardown_lock); if (is_teardown) { /* This mechanism relies on being able to get ACKs */ WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)); /* Check if peer has ACKed */ if (flags & IEEE80211_TX_STAT_ACK) { dev_kfree_skb_any(teardown_skb); } else { tdls_dbg(sdata, "TDLS Resending teardown through AP\n"); ieee80211_subif_start_xmit(teardown_skb, skb->dev); } } } static struct ieee80211_sub_if_data * ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; if (skb->dev) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; if (skb->dev == sdata->dev) return sdata; } return NULL; } return rcu_dereference(local->p2p_sdata); } static void ieee80211_report_ack_skb(struct ieee80211_local *local, struct ieee80211_tx_info *info, bool acked, bool dropped) { struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&local->ack_status_lock, flags); skb = idr_remove(&local->ack_status_frames, info->ack_frame_id); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (!skb) return; if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr = (void *)skb->data; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (sdata) { if (skb->protocol == sdata->control_port_protocol || skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) cfg80211_control_port_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); else if (ieee80211_is_any_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, info->status.ack_signal, info->status.is_valid_ack_signal, GFP_ATOMIC); else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); else pr_warn("Unknown status report in ack skb\n"); } rcu_read_unlock(); dev_kfree_skb_any(skb); } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ skb_complete_wifi_ack(skb, acked); } } static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 tx_time_est = ieee80211_info_get_tx_time_est(info); struct ieee80211_hdr *hdr = (void *)skb->data; bool acked = info->flags & IEEE80211_TX_STAT_ACK; if (dropped) acked = false; if (tx_time_est) { struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); rcu_read_unlock(); } if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (!sdata) { skb->dev = NULL; } else { unsigned int hdr_size = ieee80211_hdrlen(hdr->frame_control); /* Check to see if packet is a TDLS teardown packet */ if (ieee80211_is_data(hdr->frame_control) && (ieee80211_get_tdls_action(skb, hdr_size) == WLAN_TDLS_TEARDOWN)) { ieee80211_tdls_td_tx_handle(local, sdata, skb, info->flags); } else if (ieee80211_s1g_is_twt_setup(skb)) { if (!acked) { struct sk_buff *qskb; qskb = skb_clone(skb, GFP_ATOMIC); if (qskb) { skb_queue_tail(&sdata->status_queue, qskb); ieee80211_queue_work(&local->hw, &sdata->work); } } } else { ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); } } rcu_read_unlock(); } else if (info->ack_frame_id) { ieee80211_report_ack_skb(local, info, acked, dropped); } if (!dropped && skb->destructor) { skb->wifi_acked_valid = 1; skb->wifi_acked = acked; } ieee80211_led_tx(local); if (skb_has_frag_list(skb)) { kfree_skb_list(skb_shinfo(skb)->frag_list); skb_shinfo(skb)->frag_list = NULL; } } /* * Use a static threshold for now, best value to be determined * by testing ... * Should it depend on: * - on # of retransmissions * - current throughput (higher value for higher tpt)? */ #define STA_LOST_PKT_THRESHOLD 50 #define STA_LOST_PKT_TIME HZ /* 1 sec since last ACK */ #define STA_LOST_TDLS_PKT_THRESHOLD 10 #define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */ static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { unsigned long pkt_time = STA_LOST_PKT_TIME; unsigned int pkt_thr = STA_LOST_PKT_THRESHOLD; /* If driver relies on its own algorithm for station kickout, skip * mac80211 packet loss mechanism. */ if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) return; /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; sta->status_stats.lost_packets++; if (sta->sta.tdls) { pkt_time = STA_LOST_TDLS_PKT_TIME; pkt_thr = STA_LOST_PKT_THRESHOLD; } /* * If we're in TDLS mode, make sure that all STA_LOST_TDLS_PKT_THRESHOLD * of the last packets were lost, and that no ACK was received in the * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss * mechanism. * For non-TDLS, use STA_LOST_PKT_THRESHOLD and STA_LOST_PKT_TIME */ if (sta->status_stats.lost_packets < pkt_thr || !time_after(jiffies, sta->status_stats.last_pkt_time + pkt_time)) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, sta->status_stats.lost_packets, GFP_ATOMIC); sta->status_stats.lost_packets = 0; } static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int *retry_count) { int count = -1; int i; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) { /* just the first aggr frame carry status info */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } else if (info->status.rates[i].idx < 0) { break; } else if (i >= hw->max_report_rates) { /* the HW cannot have attempted that rate */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } count += info->status.rates[i].count; } if (count < 0) count = 0; *retry_count = count; return i - 1; } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_supported_band *sband, int retry_count, int shift, bool send_to_cooked, struct ieee80211_tx_status *status) { struct sk_buff *skb2; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sub_if_data *sdata; struct net_device *prev_dev = NULL; int rtap_len; /* send frame to monitor interfaces now */ rtap_len = ieee80211_tx_radiotap_len(info, status); if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { pr_err("ieee80211_tx_status: headroom too small\n"); dev_kfree_skb(skb); return; } ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count, rtap_len, shift, status); /* XXX: is this sufficient for BPF? */ skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { if (!ieee80211_sdata_running(sdata)) continue; if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && !send_to_cooked) continue; if (prev_dev) { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { skb2->dev = prev_dev; netif_rx(skb2); } } prev_dev = sdata->dev; } } if (prev_dev) { skb->dev = prev_dev; netif_rx(skb); skb = NULL; } rcu_read_unlock(); dev_kfree_skb(skb); } static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_tx_status *status, int rates_idx, int retry_count) { struct sk_buff *skb = status->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct sta_info *sta; __le16 fc; struct ieee80211_supported_band *sband; bool send_to_cooked; bool acked; bool noack_success; struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; sband = local->hw.wiphy->bands[info->band]; fc = hdr->frame_control; if (status->sta) { sta = container_of(status->sta, struct sta_info, sta); shift = ieee80211_vif_get_shift(&sta->sdata->vif); if (info->flags & IEEE80211_TX_STATUS_EOSP) clear_sta_flag(sta, WLAN_STA_SP); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); /* mesh Peer Service Period support */ if (ieee80211_vif_is_mesh(&sta->sdata->vif) && ieee80211_is_data_qos(fc)) ieee80211_mpsp_trigger_process( ieee80211_get_qos_ctl(hdr), sta, true, acked); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) sta->tx_stats.last_rate = info->status.rates[rates_idx]; if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) && (ieee80211_is_data_qos(fc))) { u16 ssn; u8 *qc; qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10) & IEEE80211_SCTL_SEQ); ieee80211_send_bar(&sta->sdata->vif, hdr->addr1, tid, ssn); } else if (ieee80211_is_data_qos(fc)) { u8 *qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; } if (!acked && ieee80211_is_back_req(fc)) { u16 control; /* * BAR failed, store the last SSN and retry sending * the BAR when the next unicast transmission on the * same TID succeeds. */ bar = (struct ieee80211_bar *) skb->data; control = le16_to_cpu(bar->control); if (!(control & IEEE80211_BAR_CTRL_MULTI_TID)) { u16 ssn = le16_to_cpu(bar->start_seq_num); tid = (control & IEEE80211_BAR_CTRL_TID_INFO_MASK) >> IEEE80211_BAR_CTRL_TID_INFO_SHIFT; ieee80211_set_bar_pending(sta, tid, ssn); } } if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) { ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (ieee80211_is_data_present(fc)) { if (!acked && !noack_success) sta->status_stats.msdu_failed[tid]++; sta->status_stats.msdu_retries[tid] += retry_count; } if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); } else if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) { struct ieee80211_sub_if_data *sdata; struct ieee80211_txq *txq; u32 airtime; /* Account airtime to multicast queue */ sdata = ieee80211_sdata_from_skb(local, skb); if (sdata && (txq = sdata->vif.txq)) { airtime = info->status.tx_time ?: ieee80211_calc_expected_tx_airtime(hw, &sdata->vif, NULL, skb->len, false); ieee80211_register_airtime(txq, airtime, 0); } } /* SNMP counters * Fragments are passed to low-level drivers as separate skbs, so these * are actually fragments, not frames. Update frame counters only for * the first fragment of the frame. */ if ((info->flags & IEEE80211_TX_STAT_ACK) || (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) { if (ieee80211_is_first_frag(hdr->seq_ctrl)) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (is_multicast_ether_addr(ieee80211_get_DA(hdr))) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } /* This counter shall be incremented for an acknowledged MPDU * with an individual address in the address 1 field or an MPDU * with a multicast address in the address 1 field of type Data * or Management. */ if (!is_multicast_ether_addr(hdr->addr1) || ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) I802_DEBUG_INC(local->dot11TransmittedFragmentCount); } else { if (ieee80211_is_first_frag(hdr->seq_ctrl)) I802_DEBUG_INC(local->dot11FailedCount); } if (ieee80211_is_any_nullfunc(fc) && ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { if (info->flags & IEEE80211_TX_STAT_ACK) local->ps_sdata->u.mgd.flags |= IEEE80211_STA_NULLFUNC_ACKED; mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(10)); } ieee80211_report_used_skb(local, skb, false); /* this was a transmitted frame, but now we want to reuse it */ skb_orphan(skb); /* Need to make a copy before skb->cb gets cleared */ send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) || !(ieee80211_is_data(fc)); /* * This is a bit racy but we can avoid a lot of work * with this test... */ if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) { if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); return; } /* send to monitor interfaces */ ieee80211_tx_monitor(local, skb, sband, retry_count, shift, send_to_cooked, status); } void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_status status = { .skb = skb, .info = IEEE80211_SKB_CB(skb), }; struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); if (sta) status.sta = &sta->sta; ieee80211_tx_status_ext(hw, &status); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_tx_status); void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_status *status) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct ieee80211_sta *pubsta = status->sta; struct sk_buff *skb = status->skb; struct ieee80211_supported_band *sband; struct sta_info *sta = NULL; int rates_idx, retry_count; bool acked, noack_success; u16 tx_time_est; if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); if (status->rate) sta->tx_stats.last_rate_info = *status->rate; } if (skb && (tx_time_est = ieee80211_info_get_tx_time_est(IEEE80211_SKB_CB(skb))) > 0) { /* Do this here to avoid the expensive lookup of the sta * in ieee80211_report_used_skb(). */ ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); ieee80211_info_set_tx_time_est(IEEE80211_SKB_CB(skb), 0); } if (!status->info) goto free; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); sband = hw->wiphy->bands[info->band]; acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); if (pubsta) { struct ieee80211_sub_if_data *sdata = sta->sdata; if (!acked && !noack_success) sta->status_stats.retry_failed++; sta->status_stats.retry_count += retry_count; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (sdata->vif.type == NL80211_IFTYPE_STATION && skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) ieee80211_sta_tx_notify(sdata, (void *) skb->data, acked, info->status.tx_time); if (acked) { sta->status_stats.last_ack = jiffies; if (sta->status_stats.lost_packets) sta->status_stats.lost_packets = 0; /* Track when last packet was ACKed */ sta->status_stats.last_pkt_time = jiffies; /* Reset connection monitor */ if (sdata->vif.type == NL80211_IFTYPE_STATION && unlikely(sdata->u.mgd.probe_send_count > 0)) sdata->u.mgd.probe_send_count = 0; if (info->status.is_valid_ack_signal) { sta->status_stats.last_ack_signal = (s8)info->status.ack_signal; sta->status_stats.ack_signal_filled = true; ewma_avg_signal_add(&sta->status_stats.avg_ack_signal, -info->status.ack_signal); } } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { /* * The STA is in power save mode, so assume * that this TX packet failed because of that. */ if (skb) ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (noack_success) { /* nothing to do here, do not account as lost */ } else { ieee80211_lost_packet(sta, info); } } rate_control_tx_status(local, sband, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) ieee80211s_update_metric(local, sta, status); } if (skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) return __ieee80211_tx_status(hw, status, rates_idx, retry_count); if (acked || noack_success) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (!pubsta) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } else { I802_DEBUG_INC(local->dot11FailedCount); } free: if (!skb) return; ieee80211_report_used_skb(local, skb, false); if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_tx_status_ext); void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_tx_info *info) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_supported_band *sband = hw->wiphy->bands[info->band]; struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_tx_status status = { .info = info, .sta = pubsta, }; rate_control_tx_status(local, sband, &status); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) sta->tx_stats.last_rate = info->status.rates[0]; } EXPORT_SYMBOL(ieee80211_tx_rate_update); void ieee80211_tx_status_8023(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; struct ieee80211_tx_status status = { .skb = skb, .info = IEEE80211_SKB_CB(skb), }; struct sta_info *sta; sdata = vif_to_sdata(vif); rcu_read_lock(); if (!ieee80211_lookup_ra_sta(sdata, skb, &sta) && !IS_ERR(sta)) status.sta = &sta->sta; ieee80211_tx_status_ext(hw, &status); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_tx_status_8023); void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, num_packets, GFP_ATOMIC); } EXPORT_SYMBOL(ieee80211_report_low_ack); void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); ieee80211_report_used_skb(local, skb, true); dev_kfree_skb_any(skb); } EXPORT_SYMBOL(ieee80211_free_txskb); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs) { struct sk_buff *skb; while ((skb = __skb_dequeue(skbs))) ieee80211_free_txskb(hw, skb); }
16 14 12 2 13 1 62 63 63 63 63 62 10 62 53 61 2 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2003, 2004 * * This file is part of the SCTP kernel implementation * * This file contains the code relating the chunk abstraction. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* This file is mostly in anticipation of future work, but initially * populate with fragment tracking for an outbound message. */ /* Initialize datamsg from memory. */ static void sctp_datamsg_init(struct sctp_datamsg *msg) { refcount_set(&msg->refcnt, 1); msg->send_failed = 0; msg->send_error = 0; msg->can_delay = 1; msg->abandoned = 0; msg->expires_at = 0; INIT_LIST_HEAD(&msg->chunks); } /* Allocate and initialize datamsg. */ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) { struct sctp_datamsg *msg; msg = kmalloc(sizeof(struct sctp_datamsg), gfp); if (msg) { sctp_datamsg_init(msg); SCTP_DBG_OBJCNT_INC(datamsg); } return msg; } void sctp_datamsg_free(struct sctp_datamsg *msg) { struct sctp_chunk *chunk; /* This doesn't have to be a _safe vairant because * sctp_chunk_free() only drops the refs. */ list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_chunk_free(chunk); sctp_datamsg_put(msg); } /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { struct sctp_association *asoc = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_ulpevent *ev; int error, sent; /* Release all references. */ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); if (!msg->send_failed) { sctp_chunk_put(chunk); continue; } asoc = chunk->asoc; error = msg->send_error ?: asoc->outqueue.error; sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED_EVENT)) { ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_chunk_put(chunk); } SCTP_DBG_OBJCNT_DEC(datamsg); kfree(msg); } /* Hold a reference. */ static void sctp_datamsg_hold(struct sctp_datamsg *msg) { refcount_inc(&msg->refcnt); } /* Release a reference. */ void sctp_datamsg_put(struct sctp_datamsg *msg) { if (refcount_dec_and_test(&msg->refcnt)) sctp_datamsg_destroy(msg); } /* Assign a chunk to this datamsg. */ static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk) { sctp_datamsg_hold(msg); chunk->msg = msg; } /* A data chunk can have a maximum payload of (2^16 - 20). Break * down any such message into smaller chunks. Opportunistically, fragment * the chunks down to the current MTU constraints. We may get refragmented * later if the PMTU changes, but it is _much better_ to fragment immediately * with a reasonable guess than always doing our fragmentation on the * soft-interrupt. */ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct iov_iter *from) { size_t len, first_len, max_data, remaining; size_t msg_len = iov_iter_count(from); struct sctp_shared_key *shkey = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; int err; msg = sctp_datamsg_new(GFP_KERNEL); if (!msg) return ERR_PTR(-ENOMEM); /* Note: Calculate this outside of the loop, so that all fragments * have the same expiration. */ if (asoc->peer.prsctp_capable && sinfo->sinfo_timetolive && (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) || !SCTP_PR_POLICY(sinfo->sinfo_flags))) msg->expires_at = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); /* This is the biggest possible DATA chunk that can fit into * the packet */ max_data = asoc->frag_point; if (unlikely(!max_data)) { max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), sctp_datachk_len(&asoc->stream)); pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%zu)", __func__, asoc, max_data); } /* If the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with * DATA. */ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); if (sinfo->sinfo_tsn && sinfo->sinfo_ssn != asoc->active_key_id) { shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn); if (!shkey) { err = -EINVAL; goto errout; } } else { shkey = asoc->shkey; } } /* Set first_len and then account for possible bundles on first frag */ first_len = max_data; /* Check to see if we have a pending SACK and try to let it be bundled * with this message. Do this if we don't have any data queued already. * To check that, look at out_qlen and retransmit list. * NOTE: we will not reduce to account for SACK, if the message would * not have been fragmented. */ if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max_data) first_len -= SCTP_PAD4(sizeof(struct sctp_sack_chunk)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; /* Account for a different sized first fragment */ if (msg_len >= first_len) { msg->can_delay = 0; if (msg_len > first_len) SCTP_INC_STATS(asoc->base.net, SCTP_MIB_FRAGUSRMSGS); } else { /* Which may be the only one... */ first_len = msg_len; } /* Create chunks for all DATA chunks. */ for (remaining = msg_len; remaining; remaining -= len) { u8 frag = SCTP_DATA_MIDDLE_FRAG; if (remaining == msg_len) { /* First frag, which may also be the last */ frag |= SCTP_DATA_FIRST_FRAG; len = first_len; } else { /* Middle frags */ len = max_data; } if (len >= remaining) { /* Last frag, which may also be the first */ len = remaining; frag |= SCTP_DATA_LAST_FRAG; /* The application requests to set the I-bit of the * last DATA chunk of a user message when providing * the user message to the SCTP implementation. */ if ((sinfo->sinfo_flags & SCTP_EOF) || (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) frag |= SCTP_DATA_SACK_IMM; } chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag, GFP_KERNEL); if (!chunk) { err = -ENOMEM; goto errout; } err = sctp_user_addto_chunk(chunk, len, from); if (err < 0) goto errout_chunk_free; chunk->shkey = shkey; /* Put the chunk->skb back into the form expected by send. */ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - chunk->skb->data); sctp_datamsg_assign(msg, chunk); list_add_tail(&chunk->frag_list, &msg->chunks); } return msg; errout_chunk_free: sctp_chunk_free(chunk); errout: list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); sctp_chunk_free(chunk); } sctp_datamsg_put(msg); return ERR_PTR(err); } /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { if (!chunk->asoc->peer.prsctp_capable) return 0; if (chunk->msg->abandoned) return 1; if (!chunk->has_tsn && !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)) return 0; if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } chunk->msg->abandoned = 1; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; chunk->msg->abandoned = 1; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && time_after(jiffies, chunk->msg->expires_at)) { chunk->msg->abandoned = 1; return 1; } /* PRIO policy is processed by sendmsg, not here */ return 0; } /* This chunk (and consequently entire message) has failed in its sending. */ void sctp_chunk_fail(struct sctp_chunk *chunk, int error) { chunk->msg->send_failed = 1; chunk->msg->send_error = error; }
3 1 2 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) module. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * * Fixes: * Alan Cox : Commented a couple of minor bits of surplus code * Alan Cox : Undefining IP_FORWARD doesn't include the code * (just stops a compiler warning). * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes * are junked rather than corrupting things. * Alan Cox : Frames to bad broadcast subnets are dumped * We used to process them non broadcast and * boy could that cause havoc. * Alan Cox : ip_forward sets the free flag on the * new frame it queues. Still crap because * it copies the frame but at least it * doesn't eat memory too. * Alan Cox : Generic queue code and memory fixes. * Fred Van Kempen : IP fragment support (borrowed from NET2E) * Gerhard Koerting: Forward fragmented frames correctly. * Gerhard Koerting: Fixes to my fix of the above 8-). * Gerhard Koerting: IP interface addressing fix. * Linus Torvalds : More robustness checks * Alan Cox : Even more checks: Still not as robust as it ought to be * Alan Cox : Save IP header pointer for later * Alan Cox : ip option setting * Alan Cox : Use ip_tos/ip_ttl settings * Alan Cox : Fragmentation bogosity removed * (Thanks to Mark.Bush@prg.ox.ac.uk) * Dmitry Gorodchanin : Send of a raw packet crash fix. * Alan Cox : Silly ip bug when an overlength * fragment turns up. Now frees the * queue. * Linus Torvalds/ : Memory leakage on fragmentation * Alan Cox : handling. * Gerhard Koerting: Forwarding uses IP priority hints * Teemu Rantanen : Fragment problems. * Alan Cox : General cleanup, comments and reformat * Alan Cox : SNMP statistics * Alan Cox : BSD address rule semantics. Also see * UDP as there is a nasty checksum issue * if you do things the wrong way. * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file * Alan Cox : IP options adjust sk->priority. * Pedro Roque : Fix mtu/length error in ip_forward. * Alan Cox : Avoid ip_chk_addr when possible. * Richard Underwood : IP multicasting. * Alan Cox : Cleaned up multicast handlers. * Alan Cox : RAW sockets demultiplex in the BSD style. * Gunther Mayer : Fix the SNMP reporting typo * Alan Cox : Always in group 224.0.0.1 * Pauline Middelink : Fast ip_checksum update when forwarding * Masquerading support. * Alan Cox : Multicast loopback error for 224.0.0.1 * Alan Cox : IP_MULTICAST_LOOP option. * Alan Cox : Use notifiers. * Bjorn Ekwall : Removed ip_csum (from slhc.c too) * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) * Stefan Becker : Send out ICMP HOST REDIRECT * Arnt Gulbrandsen : ip_build_xmit * Alan Cox : Per socket routing cache * Alan Cox : Fixed routing cache, added header cache. * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. * Alan Cox : Incoming IP option handling. * Alan Cox : Set saddr on raw output frames as per BSD. * Alan Cox : Stopped broadcast source route explosions. * Alan Cox : Can disable source routing * Takeshi Sone : Masquerading didn't work. * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. * Alan Cox : Memory leaks, tramples, misc debugging. * Alan Cox : Fixed multicast (by popular demand 8)) * Alan Cox : Fixed forwarding (by even more popular demand 8)) * Alan Cox : Fixed SNMP statistics [I think] * Gerhard Koerting : IP fragmentation forwarding fix * Alan Cox : Device lock against page fault. * Alan Cox : IP_HDRINCL facility. * Werner Almesberger : Zero fragment bug * Alan Cox : RAW IP frame length bug * Alan Cox : Outgoing firewall on build_xmit * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel * Alan Cox : Multicast routing hooks * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * * To Fix: * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit * the allocation of a buffer that can then be 'grown' by twiddling page tables. * Output fragmentation wants updating along with the buffer management to use a single * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> #include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> /* * Process Router Attention IP option (RFC 2113) */ bool ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; struct net *net = dev_net(dev); for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report * the packet if it came from that interface. */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) raw_rcv(last, skb2); } last = sk; } } if (last) { raw_rcv(last, skb); return true; } return false; } INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; int raw, ret; resubmit: raw = raw_local_deliver(skb, protocol); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot) { if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb(skb); return; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); if (ret < 0) { protocol = -ret; goto resubmit; } __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb(skb); } else { __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } } static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; } /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } EXPORT_SYMBOL(ip_local_deliver); static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt; const struct iphdr *iph; /* It looks as overkill, because not all IP options require packet mangling. But it is the easiest for now, especially taking into account that combination of IP options and running sniffer is extremely rare condition. --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } if (unlikely(opt->srr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev)) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); goto drop; } } if (ip_options_rcv_srr(skb, dev)) goto drop; } return false; drop: return true; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && ip_hdr(hint)->tos == iph->tos; } int tcp_v4_early_demux(struct sk_buff *skb); int udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int err, drop_reason; struct rtable *rt; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (ip_can_use_hint(skb, iph, hint)) { err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, dev, hint); if (unlikely(err)) goto drop_error; } if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { tcp_v4_early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { err = udp_v4_early_demux(skb); if (unlikely(err)) goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; } } /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, dev); if (unlikely(err)) goto drop_error; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5 && ip_rcv_options(skb, dev)) goto drop; rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * * When a host sends a datagram to a link-layer broadcast * address, the IP destination address MUST be a legal IP * broadcast or IP multicast address. * * A host SHOULD silently discard a datagram that is received * via a link-layer broadcast (see Section 2.4) but does not * specify an IP multicast or broadcast destination address. * * This doesn't explicitly say L2 *broadcast*, but broadcast is * in a way a form of multicast and the most common use case for * this is 802.11 protecting against cross-station spoofing (the * so-called "hole-196" attack) so do it for both. */ if (in_dev && IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; goto drop; } } return NET_RX_SUCCESS; drop: kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; drop_error: if (err == -EXDEV) { drop_reason = SKB_DROP_REASON_IP_RPFILTER; __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); } goto drop; } static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) return NET_RX_SUCCESS; ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; } /* * Main IP Receive routine. */ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. Doesn't have a bogus length */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); __IP_ADD_STATS(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = ntohs(iph->tot_len); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; csum_error: drop_reason = SKB_DROP_REASON_IP_CSUM; __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) drop_reason = SKB_DROP_REASON_IP_INHDR; __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb_reason(skb, drop_reason); out: return NULL; } /* * IP receive entry point */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev); skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); } static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb, int rt_type) { if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) return NULL; return skb; } static void ip_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; struct list_head sublist; INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) continue; if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, ((struct rtable *)dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip_sublist_rcv_finish(&sublist); } static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); ip_list_rcv_finish(net, NULL, head); } /* Receive a list of IP packets */ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); }
77 133 133 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6 * * Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> #include <net/ipv6.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("ip6tables mangle table"); #define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_MANGLE, }; static unsigned int ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) { unsigned int ret; struct in6_addr saddr, daddr; u_int8_t hop_limit; u_int32_t flowlabel, mark; int err; /* save source/dest address, mark, hoplimit, flowlabel, priority, */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); mark = skb->mark; hop_limit = ipv6_hdr(skb)->hop_limit; /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); ret = ip6t_do_table(skb, state, priv); if (ret != NF_DROP && ret != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || !ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { err = ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } return ret; } /* The work comes in here from netfilter.c. */ static unsigned int ip6table_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) return ip6t_mangle_out(skb, state, priv); return ip6t_do_table(skb, state, priv); } static struct nf_hook_ops *mangle_ops __read_mostly; static int ip6table_mangle_table_init(struct net *net) { struct ip6t_replace *repl; int ret; repl = ip6t_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { ip6t_unregister_table_pre_exit(net, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "mangle"); } static struct pernet_operations ip6table_mangle_net_ops = { .pre_exit = ip6table_mangle_net_pre_exit, .exit = ip6table_mangle_net_exit, }; static int __init ip6table_mangle_init(void) { int ret = xt_register_template(&packet_mangler, ip6table_mangle_table_init); if (ret < 0) return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); if (IS_ERR(mangle_ops)) { xt_unregister_template(&packet_mangler); return PTR_ERR(mangle_ops); } ret = register_pernet_subsys(&ip6table_mangle_net_ops); if (ret < 0) { xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } return ret; } static void __exit ip6table_mangle_fini(void) { unregister_pernet_subsys(&ip6table_mangle_net_ops); xt_unregister_template(&packet_mangler); kfree(mangle_ops); } module_init(ip6table_mangle_init); module_exit(ip6table_mangle_fini);
856 857 718 150 150 856 857 851 841 856 814 170 799 850 846 846 849 17 850 845 186 806 857 849 857 856 857 564 150 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 // SPDX-License-Identifier: GPL-2.0 /* * kernel userspace event delivery * * Copyright (C) 2004 Red Hat, Inc. All rights reserved. * Copyright (C) 2004 Novell, Inc. All rights reserved. * Copyright (C) 2004 IBM, Inc. All rights reserved. * * Authors: * Robert Love <rml@novell.com> * Kay Sievers <kay.sievers@vrfy.org> * Arjan van de Ven <arjanv@redhat.com> * Greg Kroah-Hartman <greg@kroah.com> */ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/kobject.h> #include <linux/export.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/uidgid.h> #include <linux/uuid.h> #include <linux/ctype.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> u64 uevent_seqnum; #ifdef CONFIG_UEVENT_HELPER char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH; #endif struct uevent_sock { struct list_head list; struct sock *sk; }; #ifdef CONFIG_NET static LIST_HEAD(uevent_sock_list); #endif /* This lock protects uevent_seqnum and uevent_sock_list */ static DEFINE_MUTEX(uevent_sock_mutex); /* the strings here must match the enum in include/linux/kobject.h */ static const char *kobject_actions[] = { [KOBJ_ADD] = "add", [KOBJ_REMOVE] = "remove", [KOBJ_CHANGE] = "change", [KOBJ_MOVE] = "move", [KOBJ_ONLINE] = "online", [KOBJ_OFFLINE] = "offline", [KOBJ_BIND] = "bind", [KOBJ_UNBIND] = "unbind", }; static int kobject_action_type(const char *buf, size_t count, enum kobject_action *type, const char **args) { enum kobject_action action; size_t count_first; const char *args_start; int ret = -EINVAL; if (count && (buf[count-1] == '\n' || buf[count-1] == '\0')) count--; if (!count) goto out; args_start = strnchr(buf, count, ' '); if (args_start) { count_first = args_start - buf; args_start = args_start + 1; } else count_first = count; for (action = 0; action < ARRAY_SIZE(kobject_actions); action++) { if (strncmp(kobject_actions[action], buf, count_first) != 0) continue; if (kobject_actions[action][count_first] != '\0') continue; if (args) *args = args_start; *type = action; ret = 0; break; } out: return ret; } static const char *action_arg_word_end(const char *buf, const char *buf_end, char delim) { const char *next = buf; while (next <= buf_end && *next != delim) if (!isalnum(*next++)) return NULL; if (next == buf) return NULL; return next; } static int kobject_action_args(const char *buf, size_t count, struct kobj_uevent_env **ret_env) { struct kobj_uevent_env *env = NULL; const char *next, *buf_end, *key; int key_len; int r = -EINVAL; if (count && (buf[count - 1] == '\n' || buf[count - 1] == '\0')) count--; if (!count) return -EINVAL; env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return -ENOMEM; /* first arg is UUID */ if (count < UUID_STRING_LEN || !uuid_is_valid(buf) || add_uevent_var(env, "SYNTH_UUID=%.*s", UUID_STRING_LEN, buf)) goto out; /* * the rest are custom environment variables in KEY=VALUE * format with ' ' delimiter between each KEY=VALUE pair */ next = buf + UUID_STRING_LEN; buf_end = buf + count - 1; while (next <= buf_end) { if (*next != ' ') goto out; /* skip the ' ', key must follow */ key = ++next; if (key > buf_end) goto out; buf = next; next = action_arg_word_end(buf, buf_end, '='); if (!next || next > buf_end || *next != '=') goto out; key_len = next - buf; /* skip the '=', value must follow */ if (++next > buf_end) goto out; buf = next; next = action_arg_word_end(buf, buf_end, ' '); if (!next) goto out; if (add_uevent_var(env, "SYNTH_ARG_%.*s=%.*s", key_len, key, (int) (next - buf), buf)) goto out; } r = 0; out: if (r) kfree(env); else *ret_env = env; return r; } /** * kobject_synth_uevent - send synthetic uevent with arguments * * @kobj: struct kobject for which synthetic uevent is to be generated * @buf: buffer containing action type and action args, newline is ignored * @count: length of buffer * * Returns 0 if kobject_synthetic_uevent() is completed with success or the * corresponding error when it fails. */ int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count) { char *no_uuid_envp[] = { "SYNTH_UUID=0", NULL }; enum kobject_action action; const char *action_args; struct kobj_uevent_env *env; const char *msg = NULL, *devpath; int r; r = kobject_action_type(buf, count, &action, &action_args); if (r) { msg = "unknown uevent action string"; goto out; } if (!action_args) { r = kobject_uevent_env(kobj, action, no_uuid_envp); goto out; } r = kobject_action_args(action_args, count - (action_args - buf), &env); if (r == -EINVAL) { msg = "incorrect uevent action arguments"; goto out; } if (r) goto out; r = kobject_uevent_env(kobj, action, env->envp); kfree(env); out: if (r) { devpath = kobject_get_path(kobj, GFP_KERNEL); pr_warn("synth uevent: %s: %s\n", devpath ?: "unknown device", msg ?: "failed to send uevent"); kfree(devpath); } return r; } #ifdef CONFIG_UEVENT_HELPER static int kobj_usermode_filter(struct kobject *kobj) { const struct kobj_ns_type_operations *ops; ops = kobj_ns_ops(kobj); if (ops) { const void *init_ns, *ns; ns = kobj->ktype->namespace(kobj); init_ns = ops->initial_ns(); return ns != init_ns; } return 0; } static int init_uevent_argv(struct kobj_uevent_env *env, const char *subsystem) { int buffer_size = sizeof(env->buf) - env->buflen; int len; len = strlcpy(&env->buf[env->buflen], subsystem, buffer_size); if (len >= buffer_size) { pr_warn("init_uevent_argv: buffer size of %d too small, needed %d\n", buffer_size, len); return -ENOMEM; } env->argv[0] = uevent_helper; env->argv[1] = &env->buf[env->buflen]; env->argv[2] = NULL; env->buflen += len + 1; return 0; } static void cleanup_uevent_env(struct subprocess_info *info) { kfree(info->data); } #endif #ifdef CONFIG_NET static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct netlink_skb_parms *parms; struct sk_buff *skb = NULL; char *scratch; size_t len; /* allocate message with maximum possible size */ len = strlen(action_string) + strlen(devpath) + 2; skb = alloc_skb(len + env->buflen, GFP_KERNEL); if (!skb) return NULL; /* add header */ scratch = skb_put(skb, len); sprintf(scratch, "%s@%s", action_string, devpath); skb_put_data(skb, env->buf, env->buflen); parms = &NETLINK_CB(skb); parms->creds.uid = GLOBAL_ROOT_UID; parms->creds.gid = GLOBAL_ROOT_GID; parms->dst_group = 1; parms->portid = 0; return skb; } static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct sk_buff *skb = NULL; struct uevent_sock *ue_sk; int retval = 0; /* send netlink message */ list_for_each_entry(ue_sk, &uevent_sock_list, list) { struct sock *uevent_sock = ue_sk->sk; if (!netlink_has_listeners(uevent_sock, 1)) continue; if (!skb) { retval = -ENOMEM; skb = alloc_uevent_skb(env, action_string, devpath); if (!skb) continue; } retval = netlink_broadcast(uevent_sock, skb_get(skb), 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (retval == -ENOBUFS || retval == -ESRCH) retval = 0; } consume_skb(skb); return retval; } static int uevent_net_broadcast_tagged(struct sock *usk, struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct user_namespace *owning_user_ns = sock_net(usk)->user_ns; struct sk_buff *skb = NULL; int ret = 0; skb = alloc_uevent_skb(env, action_string, devpath); if (!skb) return -ENOMEM; /* fix credentials */ if (owning_user_ns != &init_user_ns) { struct netlink_skb_parms *parms = &NETLINK_CB(skb); kuid_t root_uid; kgid_t root_gid; /* fix uid */ root_uid = make_kuid(owning_user_ns, 0); if (uid_valid(root_uid)) parms->creds.uid = root_uid; /* fix gid */ root_gid = make_kgid(owning_user_ns, 0); if (gid_valid(root_gid)) parms->creds.gid = root_gid; } ret = netlink_broadcast(usk, skb, 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (ret == -ENOBUFS || ret == -ESRCH) ret = 0; return ret; } #endif static int kobject_uevent_net_broadcast(struct kobject *kobj, struct kobj_uevent_env *env, const char *action_string, const char *devpath) { int ret = 0; #ifdef CONFIG_NET const struct kobj_ns_type_operations *ops; const struct net *net = NULL; ops = kobj_ns_ops(kobj); if (!ops && kobj->kset) { struct kobject *ksobj = &kobj->kset->kobj; if (ksobj->parent != NULL) ops = kobj_ns_ops(ksobj->parent); } /* kobjects currently only carry network namespace tags and they * are the only tag relevant here since we want to decide which * network namespaces to broadcast the uevent into. */ if (ops && ops->netlink_ns && kobj->ktype->namespace) if (ops->type == KOBJ_NS_TYPE_NET) net = kobj->ktype->namespace(kobj); if (!net) ret = uevent_net_broadcast_untagged(env, action_string, devpath); else ret = uevent_net_broadcast_tagged(net->uevent_sock->sk, env, action_string, devpath); #endif return ret; } static void zap_modalias_env(struct kobj_uevent_env *env) { static const char modalias_prefix[] = "MODALIAS="; size_t len; int i, j; for (i = 0; i < env->envp_idx;) { if (strncmp(env->envp[i], modalias_prefix, sizeof(modalias_prefix) - 1)) { i++; continue; } len = strlen(env->envp[i]) + 1; if (i != env->envp_idx - 1) { memmove(env->envp[i], env->envp[i + 1], env->buflen - len); for (j = i; j < env->envp_idx - 1; j++) env->envp[j] = env->envp[j + 1] - len; } env->envp_idx--; env->buflen -= len; } } /** * kobject_uevent_env - send an uevent with environmental data * * @kobj: struct kobject that the action is happening to * @action: action that is happening * @envp_ext: pointer to environmental data * * Returns 0 if kobject_uevent_env() is completed with success or the * corresponding error when it fails. */ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp_ext[]) { struct kobj_uevent_env *env; const char *action_string = kobject_actions[action]; const char *devpath = NULL; const char *subsystem; struct kobject *top_kobj; struct kset *kset; const struct kset_uevent_ops *uevent_ops; int i = 0; int retval = 0; /* * Mark "remove" event done regardless of result, for some subsystems * do not want to re-trigger "remove" event via automatic cleanup. */ if (action == KOBJ_REMOVE) kobj->state_remove_uevent_sent = 1; pr_debug("kobject: '%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); /* search the kset we belong to */ top_kobj = kobj; while (!top_kobj->kset && top_kobj->parent) top_kobj = top_kobj->parent; if (!top_kobj->kset) { pr_debug("kobject: '%s' (%p): %s: attempted to send uevent " "without kset!\n", kobject_name(kobj), kobj, __func__); return -EINVAL; } kset = top_kobj->kset; uevent_ops = kset->uevent_ops; /* skip the event, if uevent_suppress is set*/ if (kobj->uevent_suppress) { pr_debug("kobject: '%s' (%p): %s: uevent_suppress " "caused the event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* skip the event, if the filter returns zero. */ if (uevent_ops && uevent_ops->filter) if (!uevent_ops->filter(kset, kobj)) { pr_debug("kobject: '%s' (%p): %s: filter function " "caused the event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* originating subsystem */ if (uevent_ops && uevent_ops->name) subsystem = uevent_ops->name(kset, kobj); else subsystem = kobject_name(&kset->kobj); if (!subsystem) { pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the " "event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* environment buffer */ env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); if (!env) return -ENOMEM; /* complete object path */ devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { retval = -ENOENT; goto exit; } /* default keys */ retval = add_uevent_var(env, "ACTION=%s", action_string); if (retval) goto exit; retval = add_uevent_var(env, "DEVPATH=%s", devpath); if (retval) goto exit; retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem); if (retval) goto exit; /* keys passed in from the caller */ if (envp_ext) { for (i = 0; envp_ext[i]; i++) { retval = add_uevent_var(env, "%s", envp_ext[i]); if (retval) goto exit; } } /* let the kset specific function add its stuff */ if (uevent_ops && uevent_ops->uevent) { retval = uevent_ops->uevent(kset, kobj, env); if (retval) { pr_debug("kobject: '%s' (%p): %s: uevent() returned " "%d\n", kobject_name(kobj), kobj, __func__, retval); goto exit; } } switch (action) { case KOBJ_ADD: /* * Mark "add" event so we can make sure we deliver "remove" * event to userspace during automatic cleanup. If * the object did send an "add" event, "remove" will * automatically generated by the core, if not already done * by the caller. */ kobj->state_add_uevent_sent = 1; break; case KOBJ_UNBIND: zap_modalias_env(env); break; default: break; } mutex_lock(&uevent_sock_mutex); /* we will send an event, so request a new sequence number */ retval = add_uevent_var(env, "SEQNUM=%llu", ++uevent_seqnum); if (retval) { mutex_unlock(&uevent_sock_mutex); goto exit; } retval = kobject_uevent_net_broadcast(kobj, env, action_string, devpath); mutex_unlock(&uevent_sock_mutex); #ifdef CONFIG_UEVENT_HELPER /* call uevent_helper, usually only enabled during early boot */ if (uevent_helper[0] && !kobj_usermode_filter(kobj)) { struct subprocess_info *info; retval = add_uevent_var(env, "HOME=/"); if (retval) goto exit; retval = add_uevent_var(env, "PATH=/sbin:/bin:/usr/sbin:/usr/bin"); if (retval) goto exit; retval = init_uevent_argv(env, subsystem); if (retval) goto exit; retval = -ENOMEM; info = call_usermodehelper_setup(env->argv[0], env->argv, env->envp, GFP_KERNEL, NULL, cleanup_uevent_env, env); if (info) { retval = call_usermodehelper_exec(info, UMH_NO_WAIT); env = NULL; /* freed by cleanup_uevent_env */ } } #endif exit: kfree(devpath); kfree(env); return retval; } EXPORT_SYMBOL_GPL(kobject_uevent_env); /** * kobject_uevent - notify userspace by sending an uevent * * @kobj: struct kobject that the action is happening to * @action: action that is happening * * Returns 0 if kobject_uevent() is completed with success or the * corresponding error when it fails. */ int kobject_uevent(struct kobject *kobj, enum kobject_action action) { return kobject_uevent_env(kobj, action, NULL); } EXPORT_SYMBOL_GPL(kobject_uevent); /** * add_uevent_var - add key value string to the environment buffer * @env: environment buffer structure * @format: printf format for the key=value pair * * Returns 0 if environment variable was added successfully or -ENOMEM * if no space was available. */ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) { va_list args; int len; if (env->envp_idx >= ARRAY_SIZE(env->envp)) { WARN(1, KERN_ERR "add_uevent_var: too many keys\n"); return -ENOMEM; } va_start(args, format); len = vsnprintf(&env->buf[env->buflen], sizeof(env->buf) - env->buflen, format, args); va_end(args); if (len >= (sizeof(env->buf) - env->buflen)) { WARN(1, KERN_ERR "add_uevent_var: buffer size too small\n"); return -ENOMEM; } env->envp[env->envp_idx++] = &env->buf[env->buflen]; env->buflen += len + 1; return 0; } EXPORT_SYMBOL_GPL(add_uevent_var); #if defined(CONFIG_NET) static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb, struct netlink_ext_ack *extack) { /* u64 to chars: 2^64 - 1 = 21 chars */ char buf[sizeof("SEQNUM=") + 21]; struct sk_buff *skbc; int ret; /* bump and prepare sequence number */ ret = snprintf(buf, sizeof(buf), "SEQNUM=%llu", ++uevent_seqnum); if (ret < 0 || (size_t)ret >= sizeof(buf)) return -ENOMEM; ret++; /* verify message does not overflow */ if ((skb->len + ret) > UEVENT_BUFFER_SIZE) { NL_SET_ERR_MSG(extack, "uevent message too big"); return -EINVAL; } /* copy skb and extend to accommodate sequence number */ skbc = skb_copy_expand(skb, 0, ret, GFP_KERNEL); if (!skbc) return -ENOMEM; /* append sequence number */ skb_put_data(skbc, buf, ret); /* remove msg header */ skb_pull(skbc, NLMSG_HDRLEN); /* set portid 0 to inform userspace message comes from kernel */ NETLINK_CB(skbc).portid = 0; NETLINK_CB(skbc).dst_group = 1; ret = netlink_broadcast(usk, skbc, 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (ret == -ENOBUFS || ret == -ESRCH) ret = 0; return ret; } static int uevent_net_rcv_skb(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net; int ret; if (!nlmsg_data(nlh)) return -EINVAL; /* * Verify that we are allowed to send messages to the target * network namespace. The caller must have CAP_SYS_ADMIN in the * owning user namespace of the target network namespace. */ net = sock_net(NETLINK_CB(skb).sk); if (!netlink_ns_capable(skb, net->user_ns, CAP_SYS_ADMIN)) { NL_SET_ERR_MSG(extack, "missing CAP_SYS_ADMIN capability"); return -EPERM; } mutex_lock(&uevent_sock_mutex); ret = uevent_net_broadcast(net->uevent_sock->sk, skb, extack); mutex_unlock(&uevent_sock_mutex); return ret; } static void uevent_net_rcv(struct sk_buff *skb) { netlink_rcv_skb(skb, &uevent_net_rcv_skb); } static int uevent_net_init(struct net *net) { struct uevent_sock *ue_sk; struct netlink_kernel_cfg cfg = { .groups = 1, .input = uevent_net_rcv, .flags = NL_CFG_F_NONROOT_RECV }; ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL); if (!ue_sk) return -ENOMEM; ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, &cfg); if (!ue_sk->sk) { pr_err("kobject_uevent: unable to create netlink socket!\n"); kfree(ue_sk); return -ENODEV; } net->uevent_sock = ue_sk; /* Restrict uevents to initial user namespace. */ if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) { mutex_lock(&uevent_sock_mutex); list_add_tail(&ue_sk->list, &uevent_sock_list); mutex_unlock(&uevent_sock_mutex); } return 0; } static void uevent_net_exit(struct net *net) { struct uevent_sock *ue_sk = net->uevent_sock; if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) { mutex_lock(&uevent_sock_mutex); list_del(&ue_sk->list); mutex_unlock(&uevent_sock_mutex); } netlink_kernel_release(ue_sk->sk); kfree(ue_sk); } static struct pernet_operations uevent_net_ops = { .init = uevent_net_init, .exit = uevent_net_exit, }; static int __init kobject_uevent_init(void) { return register_pernet_subsys(&uevent_net_ops); } postcore_initcall(kobject_uevent_init); #endif
24 24 24 24 24 1 6 24 18 24 24 24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/eventpoll.c (Efficient event retrieval implementation) * Copyright (C) 2001,...,2009 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/signal.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/string.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/rbtree.h> #include <linux/wait.h> #include <linux/eventpoll.h> #include <linux/mount.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> #include <net/busy_poll.h> /* * LOCKING: * There are three level of locking required by epoll : * * 1) epmutex (mutex) * 2) ep->mtx (mutex) * 3) ep->lock (rwlock) * * The acquire order is the one listed above, from 1 to 3. * We need a rwlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need * a spinlock. During the event transfer loop (from kernel to * user space) we could end up sleeping due a copy_to_user(), so * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). * Then we also need a global mutex to serialize eventpoll_release_file() * and ep_free(). * This mutex is acquired by ep_free() during the epoll file * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). * It is also acquired when inserting an epoll fd onto another epoll * fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. * It is necessary to acquire multiple "ep->mtx"es at once in the * case when one epoll fd is added to another. In this case, we * always acquire the locks in the order of nesting (i.e. after * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired * before e2->mtx). Since we disallow cycles of epoll file * descriptors, this ensures that the mutexes are well-ordered. In * order to communicate this nesting to lockdep, when walking a tree * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epmutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) struct epoll_filefd { struct file *file; int fd; } __packed; /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ struct eppoll_entry *next; /* The "base" pointer is set to the container "struct epitem" */ struct epitem *base; /* * Wait queue item that will be linked to the target file wait * queue head. */ wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; }; /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. * Avoid increasing the size of this struct, there can be many thousands * of these on a server and we do not want this to take another cache line. */ struct epitem { union { /* RB tree node links this structure to the eventpoll RB tree */ struct rb_node rbn; /* Used to free the struct epitem */ struct rcu_head rcu; }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ struct epitem *next; /* The file descriptor information this item refers to */ struct epoll_filefd ffd; /* List containing poll wait queues */ struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; /* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* Lock which protects rdllist and ovflist */ rwlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; /* wakeup_source used when ep_scan_ready_list is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ struct user_struct *user; struct file *file; /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC /* tracks wakeup nests for lockdep validation */ u8 nests; #endif }; /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; struct epitem *epi; }; /* * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). */ static DEFINE_MUTEX(epmutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epmutex. */ struct epitems_head { struct hlist_head epitems; struct epitems_head *next; }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; static struct kmem_cache *ephead_cache __read_mostly; static inline void free_ephead(struct epitems_head *head) { if (head) kmem_cache_free(ephead_cache, head); } static void list_file(struct file *file) { struct epitems_head *head; head = container_of(file->f_ep, struct epitems_head, epitems); if (!head->next) { head->next = tfile_check_list; tfile_check_list = head; } } static void unlist_file(struct epitems_head *head) { struct epitems_head *to_free = head; struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); if (p) { struct epitem *epi= container_of(p, struct epitem, fllink); spin_lock(&epi->ffd.file->f_lock); if (!hlist_empty(&head->epitems)) to_free = NULL; head->next = NULL; spin_unlock(&epi->ffd.file->f_lock); } free_ephead(to_free); } #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long long_zero; static long long_max = LONG_MAX; struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &long_zero, .extra2 = &long_max, }, { } }; #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; } /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, struct file *file, int fd) { ffd->file = file; ffd->fd = fd; } /* Compare RB tree keys */ static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2) { return (p1->file > p2->file ? +1: (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct epitem *epi) { return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /** * ep_events_available - Checks if ready events might be available. * * @ep: Pointer to the eventpoll context. * * Return: a value different than %zero if ready events are available, * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; return ep_events_available(ep) || busy_loop_timeout(start_time); } /* * Busy poll if globally on and supporting sockets found && no events, * busy loop will return if need_resched or ep_events_available. * * we must do our busy polling with irqs enabled */ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) { napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, BUSY_POLL_BUDGET); if (ep_events_available(ep)) return true; /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ ep->napi_id = 0; return false; } return false; } /* * Set epoll busy poll NAPI ID from sk. */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { struct eventpoll *ep; unsigned int napi_id; struct socket *sock; struct sock *sk; if (!net_busy_loop_on()) return; sock = sock_from_file(epi->ffd.file); if (!sock) return; sk = sock->sk; if (!sk) return; napi_id = READ_ONCE(sk->sk_napi_id); ep = epi->ep; /* Non-NAPI IDs can be rejected * or * Nothing to do if we already have this ID */ if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; } #else static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) { return false; } static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } #endif /* CONFIG_NET_RX_BUSY_POLL */ /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled * manner. Wake ups can nest inside each other, but are never done * with the same locking. For example: * * dfd = socket(...); * efd1 = epoll_create(); * efd2 = epoll_create(); * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); * * When a packet arrives to the device underneath "dfd", the net code will * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a * callback wakeup entry on that queue, and the wake_up() performed by the * "dfd" net code will end up in ep_poll_callback(). At this point epoll * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to * avoid stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; u8 nests = 0; /* * To set the subclass or nesting level for spin_lock_irqsave_nested() * it might be natural to create a per-cpu nest count. However, since * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can * schedule() in the -rt kernel, the per-cpu variable are no longer * protected. Thus, we are introducing a per eventpoll nest field. * If we are not being call from ep_poll_callback(), epi is NULL and * we are at the first level of nesting, 0. Otherwise, we are being * called from ep_poll_callback() and if a previous wakeup source is * not an epoll file itself, we are at depth 1 since the wakeup source * is depth 0. If the wakeup source is a previous epoll file in the * wakeup chain then we use its nests value and record ours as * nests + 1. The previous epoll file nests value is stable since its * already holding its own poll_wait.lock. */ if (epi) { if ((is_file_epoll(epi->ffd.file))) { ep_src = epi->ffd.file->private_data; nests = ep_src->nests; } else { nests = 1; } } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; rcu_read_lock(); /* * If it is cleared by POLLFREE, it should be rcu-safe. * If we read NULL we need a barrier paired with * smp_store_release() in ep_poll_callback(), otherwise * we rely on whead->lock. */ whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); } /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held (or "epmutex" if called from * ep_free). */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; while ((pwq = *p) != NULL) { *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } /* call only when ep->mtx is held */ static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) { return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); } /* call only when ep->mtx is held */ static inline void ep_pm_stay_awake(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); if (ws) __pm_stay_awake(ws); } static inline bool ep_has_wakeup_source(struct epitem *epi) { return rcu_access_pointer(epi->ws) ? true : false; } /* call when ep->mtx cannot be held (ep_poll_callback) */ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) { struct wakeup_source *ws; rcu_read_lock(); ws = rcu_dereference(epi->ws); if (ws) __pm_stay_awake(ws); rcu_read_unlock(); } /* * ep->mutex needs to be held because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ lockdep_assert_irqs_enabled(); write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); write_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, struct list_head *txlist) { struct epitem *epi, *nepi; write_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { /* * ->ovflist is LIFO, so we have to reverse it in order * to keep in FIFO. */ list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". */ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); } write_unlock_irq(&ep->lock); } static void epi_rcu_free(struct rcu_head *head) { struct epitem *epi = container_of(head, struct epitem, rcu); kmem_cache_free(epi_cache, epi); } /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. */ static int ep_remove(struct eventpoll *ep, struct epitem *epi) { struct file *file = epi->ffd.file; struct epitems_head *to_free; struct hlist_head *head; lockdep_assert_irqs_enabled(); /* * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { file->f_ep = NULL; if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); if (!smp_load_acquire(&v->next)) to_free = v; } } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* * At this point it is safe to free the eventpoll item. Use the union * field epi->rcu, since we are trying to minimize the size of * 'struct epitem'. The 'rbn' field is no longer in use. Protected by * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ call_rcu(&epi->rcu, epi_rcu_free); percpu_counter_dec(&ep->user->epoll_watches); return 0; } static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; struct epitem *epi; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); /* * We need to lock this because we could be hit by * eventpoll_release_file() while we're freeing the "struct eventpoll". * We do not need to hold "ep->mtx" here because the epoll file * is on the way to be removed and no one has references to it * anymore. The only hit might come from eventpoll_release_file() but * holding "epmutex" is sufficient here. */ mutex_lock(&epmutex); /* * Walks through the whole tree by unregistering poll callbacks. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); cond_resched(); } /* * Walks through the whole tree by freeing each "struct epitem". At this * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit * us during this operation. So we can avoid the lock on "ep->lock". * We do not need to lock ep->mtx, either, we only do it to prevent * a lockdep warning. */ mutex_lock(&ep->mtx); while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); cond_resched(); } mutex_unlock(&ep->mtx); mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); kfree(ep); } static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; if (ep) ep_free(ep); return 0; } static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; LIST_HEAD(txlist); struct epitem *epi, *tmp; poll_table pt; __poll_t res = 0; init_poll_funcptr(&pt, NULL); /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside * the ready list. */ mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { if (ep_item_poll(epi, &pt, depth + 1)) { res = EPOLLIN | EPOLLRDNORM; break; } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. */ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { struct file *file = epi->ffd.file; __poll_t res; pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); return res & epi->event.events; } static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; mutex_lock(&ep->mtx); for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " " pos:%lli ino:%lx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, inode->i_ino, inode->i_sb->s_dev); if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); } #endif /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, }; /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are * closed without being removed from the eventpoll interface. */ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; struct hlist_node *next; /* * We don't want to get "file->f_lock" because it is not * necessary. It is not necessary because we're in the "struct file" * cleanup path, and this means that no one is using this file anymore. * So, for example, epoll_ctl() cannot hit here since if we reach this * point, the file counter already went to zero and fget() would fail. * The only hit might come from ep_free() but by holding the mutex * will correctly serialize the operation. We do need to acquire * "ep->mtx" after "epmutex" because ep_remove() requires it when called * from anywhere but ep_free(). * * Besides, ep_remove() acquires the lock, so we can't hold it here. */ mutex_lock(&epmutex); if (unlikely(!file->f_ep)) { mutex_unlock(&epmutex); return; } hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { ep = epi->ep; mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); mutex_unlock(&ep->mtx); } mutex_unlock(&epmutex); } static int ep_alloc(struct eventpoll **pep) { int error; struct user_struct *user; struct eventpoll *ep; user = get_current_user(); error = -ENOMEM; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) goto free_uid; mutex_init(&ep->mtx); rwlock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = user; *pep = ep; return 0; free_uid: free_uid(user); return error; } /* * Search the file inside the eventpoll tree. The RB tree operations * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. */ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { epir = epi; break; } } return epir; } #ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; struct epitem *epi; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) return epi; else toff--; } cond_resched(); } return NULL; } struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff) { struct file *file_raw; struct eventpoll *ep; struct epitem *epi; if (!is_file_epoll(file)) return ERR_PTR(-EINVAL); ep = file->private_data; mutex_lock(&ep->mtx); epi = ep_find_tfd(ep, tfd, toff); if (epi) file_raw = epi->ffd.file; else file_raw = ERR_PTR(-ENOENT); mutex_unlock(&ep->mtx); return file_raw; } #endif /* CONFIG_KCMP */ /* * Adds a new entry to the tail of the list in a lockless way, i.e. * multiple CPUs are allowed to call this function concurrently. * * Beware: it is necessary to prevent any other modifications of the * existing list until all changes are completed, in other words * concurrent list_add_tail_lockless() calls should be protected * with a read lock, where write lock acts as a barrier which * makes sure all list_add_tail_lockless() calls are fully * completed. * * Also an element can be locklessly added to the list only in one * direction i.e. either to the tail or to the head, otherwise * concurrent access will corrupt the list. * * Return: %false if element has been already added to the list, %true * otherwise. */ static inline bool list_add_tail_lockless(struct list_head *new, struct list_head *head) { struct list_head *prev; /* * This is simple 'new->next = head' operation, but cmpxchg() * is used in order to detect that same element has been just * added to the list from another CPU: the winner observes * new->next == new. */ if (cmpxchg(&new->next, new, head) != new) return false; /* * Initially ->next of a new element must be updated with the head * (we are inserting to the tail) and only then pointers are atomically * exchanged. XCHG guarantees memory ordering, thus ->next should be * updated before pointers are actually swapped and pointers are * swapped before prev->next is updated. */ prev = xchg(&head->prev, new); /* * It is safe to modify prev->next and new->prev, because a new element * is added only to the tail and new->next is updated before XCHG. */ prev->next = new; new->prev = prev; return true; } /* * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, * i.e. multiple CPUs are allowed to call this function concurrently. * * Return: %false if epi element has been already chained, %true otherwise. */ static inline bool chain_epi_lockless(struct epitem *epi) { struct eventpoll *ep = epi->ep; /* Fast preliminary check */ if (epi->next != EP_UNACTIVE_PTR) return false; /* Check that the same epi has not been just chained from another CPU */ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) return false; /* Atomically exchange tail */ epi->next = xchg(&ep->ovflist, epi); return true; } /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. * * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from * ep_scan_ready_list(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called * concurrently for the same @epi from different CPUs if poll table was inited * with several wait queues entries. Plural wakeup from different CPUs of a * single wait queue is serialized by wq.lock, but the case when multiple wait * queues are used should be detected accordingly. This is detected using * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); unsigned long flags; int ewake = 0; read_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (chain_epi_lockless(epi)) ep_pm_stay_awake_rcu(epi); } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) ep_pm_stay_awake_rcu(epi); } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !(pollflags & POLLFREE)) { switch (pollflags & EPOLLINOUT_BITS) { case EPOLLIN: if (epi->event.events & EPOLLIN) ewake = 1; break; case EPOLLOUT: if (epi->event.events & EPOLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: read_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; if (pollflags & POLLFREE) { /* * If we race with ep_remove_wait_queue() it can miss * ->whead = NULL and do another remove_wait_queue() after * us, so we can't use __remove_wait_queue(). */ list_del_init(&wait->entry); /* * ->whead != NULL protects us from the race with ep_free() * or ep_remove(), ep_remove_wait_queue() takes whead->lock * held by the caller. Once we nullify it, nothing protects * ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } return ewake; } /* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); struct epitem *epi = epq->epi; struct eppoll_entry *pwq; if (unlikely(!epi)) // an earlier allocation has failed return; pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); if (unlikely(!pwq)) { epq->epi = NULL; return; } init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; if (epi->event.events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); pwq->next = epi->pwqlist; epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) { p = &parent->rb_right; leftmost = false; } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate * from a single file of interest. For example, we allow 1000 paths of length * 1, to emanate from each file of interest. This essentially represents the * potential wakeup paths, which need to be limited in order to avoid massive * uncontrolled wakeup storms. The common use case should be a single ep which * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify * and delete can't add additional paths. Protected by the epmutex. */ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; static int path_count_inc(int nests) { /* Allow an arbitrary number of depth 1 paths */ if (nests == 0) return 0; if (++path_count[nests] > path_limits[nests]) return -1; return 0; } static void path_count_init(void) { int i; for (i = 0; i < PATH_ARR_SIZE; i++) path_count[i] = 0; } static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; if (depth > EP_MAX_NESTS) /* too deep nesting */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ hlist_for_each_entry_rcu(epi, refs, fllink) { struct hlist_head *refs = &epi->ep->refs; if (hlist_empty(refs)) error = path_count_inc(depth); else error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } return error; } /** * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up * eventpoll objects. * * Return: %zero if the proposed links don't create too many paths, * %-1 otherwise. */ static int reverse_path_check(void) { struct epitems_head *p; for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { int error; path_count_init(); rcu_read_lock(); error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) return error; } return 0; } static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); ws = wakeup_source_register(NULL, n.name.name); release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; rcu_assign_pointer(epi->ws, ws); return 0; } /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ static noinline void ep_destroy_wakeup_source(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); RCU_INIT_POINTER(epi->ws, NULL); /* * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is * used internally by wakeup_source_remove, too (called by * wakeup_source_unregister), so we cannot use call_rcu */ synchronize_rcu(); wakeup_source_unregister(ws); } static int attach_epitem(struct file *file, struct epitem *epi) { struct epitems_head *to_free = NULL; struct hlist_head *head = NULL; struct eventpoll *ep = NULL; if (is_file_epoll(file)) ep = file->private_data; if (ep) { head = &ep->refs; } else if (!READ_ONCE(file->f_ep)) { allocate: to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); if (!to_free) return -ENOMEM; head = &to_free->epitems; } spin_lock(&file->f_lock); if (!file->f_ep) { if (unlikely(!head)) { spin_unlock(&file->f_lock); goto allocate; } file->f_ep = head; to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); spin_unlock(&file->f_lock); free_ephead(to_free); return 0; } /* * Must be called with "mtx" held. */ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct file *tfile, int fd, int full_check) { int error, pwake = 0; __poll_t revents; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; if (is_file_epoll(tfile)) tep = tfile->private_data; lockdep_assert_irqs_enabled(); if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, max_user_watches) >= 0)) return -ENOSPC; percpu_counter_inc(&ep->user->epoll_watches); if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->next = EP_UNACTIVE_PTR; if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { if (tep) mutex_unlock(&tep->mtx); kmem_cache_free(epi_cache, epi); percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ ep_rbtree_insert(ep, epi); if (tep) mutex_unlock(&tep->mtx); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { ep_remove(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { ep_remove(ep, epi); return error; } } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ if (unlikely(!epq.epi)) { ep_remove(ep, epi); return -ENOMEM; } /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } /* * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */ static int ep_modify(struct eventpoll *ep, struct epitem *epi, const struct epoll_event *event) { int pwake = 0; poll_table pt; lockdep_assert_irqs_enabled(); init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* * The following barrier has two effects: * * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also * pairs with the barrier in wq_has_sleeper (see * comments for wq_has_sleeper). * * This barrier will now guarantee ep_poll_callback or f_op->poll * (or both) will notice the readiness of an item. */ smp_mb(); /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { write_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct epitem *epi, *tmp; LIST_HEAD(txlist); poll_table pt; int res = 0; /* * Always short-circuit for fatal signals to allow threads to make a * timely exit without the chance of finding more events available and * fetching repeatedly. */ if (fatal_signal_pending(current)) return -EINTR; init_poll_funcptr(&pt, NULL); mutex_lock(&ep->mtx); ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop we are holding ep->mtx. */ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { struct wakeup_source *ws; __poll_t revents; if (res >= maxevents) break; /* * Activate ep->ws before deactivating epi->ws to prevent * triggering auto-suspend here (in case we reactive epi->ws * below). * * This could be rearranged to delay the deactivation of epi->ws * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ ws = ep_wakeup_source(epi); if (ws) { if (ws->active) __pm_stay_awake(ep->ws); __pm_relax(ws); } list_del_init(&epi->rdllink); /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding ep->mtx, * so no operations coming from userspace can change the item. */ revents = ep_item_poll(epi, &pt, 1); if (!revents) continue; events = epoll_put_uevent(revents, epi->event.data, events); if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) res = -EFAULT; break; } res++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by * ep_scan_ready_list() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) { struct timespec64 now; if (ms < 0) return NULL; if (!ms) { to->tv_sec = 0; to->tv_nsec = 0; return to; } to->tv_sec = ms / MSEC_PER_SEC; to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC); ktime_get_ts64(&now); *to = timespec64_add_safe(now, *to); return to; } /* * autoremove_wake_function, but remove even on failure to wake up, because we * know that default_wake_function/ttwu will only fail if the thread is already * woken, and in that case the ep_poll loop will remove the entry anyways, not * try to reuse it. */ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); list_del_init(&wq_entry->entry); return ret; } /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. * @events: Pointer to the userspace buffer where the ready events should be * stored. * @maxevents: Size (in terms of number of events) of the caller event buffer. * @timeout: Maximum timeout for the ready events fetch operation, in * timespec. If the timeout is zero, the function will not block, * while if the @timeout ptr is NULL, the function will block * until at least one event has been retrieved (or an error * occurred). * * Return: the number of ready events which have been fetched, or an * error code, in case of error. */ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout) { int res, eavail, timed_out = 0; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; lockdep_assert_irqs_enabled(); if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { slack = select_estimate_accuracy(timeout); to = &expires; *to = timespec64_to_ktime(*timeout); } else if (timeout) { /* * Avoid the unnecessary trip to the wait queue loop, if the * caller specified a non blocking operation. */ timed_out = 1; } /* * This call is racy: We may or may not see events that are being added * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. */ eavail = ep_events_available(ep); while (1) { if (eavail) { /* * Try to transfer events to user space. In case we get * 0 events and there's still timeout left over, we go * trying again in search of more luck. */ res = ep_send_events(ep, events, maxevents); if (res) return res; } if (timed_out) return 0; eavail = ep_busy_loop(ep, timed_out); if (eavail) continue; if (signal_pending(current)) return -EINTR; /* * Internally init_wait() uses autoremove_wake_function(), * thus wait entry is removed from the wait queue on each * wakeup. Why it is important? In case of several waiters * each new wakeup will hit the next waiter, giving it the * chance to harvest new event. Otherwise wakeup can be * lost. This is also good performance-wise, because on * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. * * In fact, we now use an even more aggressive function that * unconditionally removes, because we don't reuse the wait * entry between loop iterations. This lets us also avoid the * performance issue if a process is killed, causing all of its * threads to wake up without being removed normally. */ init_wait(&wait); wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it * is safe to avoid an explicit barrier. */ __set_current_state(TASK_INTERRUPTIBLE); /* * Do the final check under the lock. ep_scan_ready_list() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is * important. */ eavail = ep_events_available(ep); if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); write_unlock_irq(&ep->lock); if (!eavail) timed_out = !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * We were woken up, thus go and try to harvest some events. * If timed out and still on the wait queue, recheck eavail * carefully under lock, below. */ eavail = 1; if (!list_empty_careful(&wait.entry)) { write_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its * timeout expired before it could reacquire the lock. * Thus, when wait.entry is empty, it needs to harvest * events. */ if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); write_unlock_irq(&ep->lock); } } } /** * ep_loop_check_proc - verify that adding an epoll file inside another * epoll structure does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * * Return: %zero if adding the epoll @file inside current epoll * structure @ep does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int error = 0; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) error = -1; else error = ep_loop_check_proc(ep_tovisit, depth + 1); if (error != 0) break; } else { /* * If we've reached a file that is not associated with * an ep, then we need to check if the newly added * links are going to add too many wakeup paths. We do * this by adding it to the tfile_check_list, if it's * not already there, and calling reverse_path_check() * during ep_insert(). */ list_file(epi->ffd.file); } } mutex_unlock(&ep->mtx); return error; } /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * * @ep: Pointer to the epoll we are inserting into. * @to: Pointer to the epoll to be inserted. * * Return: %zero if adding the epoll @to inside the epoll @from * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { inserting_into = ep; return ep_loop_check_proc(to, 0); } static void clear_tfile_check_list(void) { rcu_read_lock(); while (tfile_check_list != EP_UNACTIVE_PTR) { struct epitems_head *head = tfile_check_list; tfile_check_list = head->next; unlist_file(head); } rcu_read_unlock(); } /* * Open an eventpoll file descriptor. */ static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; /* * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); if (fd < 0) { error = fd; goto out_free_ep; } file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_fd; } ep->file = file; fd_install(fd, file); return fd; out_free_fd: put_unused_fd(fd); out_free_ep: ep_free(ep); return error; } SYSCALL_DEFINE1(epoll_create1, int, flags) { return do_epoll_create(flags); } SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; return do_epoll_create(0); } static inline int epoll_mutex_lock(struct mutex *mutex, int depth, bool nonblock) { if (!nonblock) { mutex_lock_nested(mutex, depth); return 0; } if (mutex_trylock(mutex)) return 0; return -EAGAIN; } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock) { int error; int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; error = -EBADF; f = fdget(epfd); if (!f.file) goto error_return; /* Get the "struct file *" for the target file */ tf = fdget(fd); if (!tf.file) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; if (!file_can_poll(tf.file)) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = f.file->private_data; /* * When we insert an epoll file descriptor inside another epoll file * descriptor, there is the chance of creating closed loops, which are * better be handled here, than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the * 'epmutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epmutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; if (is_file_epoll(tf.file)) { tep = tf.file->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; } } /* * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, tf.file, fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; break; case EPOLL_CTL_DEL: if (epi) error = ep_remove(ep, epi); else error = -ENOENT; break; case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_modify(ep, epi, epds); } } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; mutex_unlock(&epmutex); } fdput(tf); error_fput: fdput(f); error_return: return error; } /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { struct epoll_event epds; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) return -EFAULT; return do_epoll_ctl(epfd, op, fd, &epds, false); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { int error; struct fd f; struct eventpoll *ep; /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; /* Verify that the area passed by the user is writeable */ if (!access_ok(events, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* Get the "struct file *" for the eventpoll file */ f = fdget(epfd); if (!f.file) return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; if (!is_file_epoll(f.file)) goto error_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = f.file->private_data; /* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, to); error_fput: fdput(f); return error; } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { struct timespec64 to; return do_epoll_wait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout)); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). */ static int do_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to, const sigset_t __user *sigmask, size_t sigsetsize) { int error; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, to); restore_saved_sigmask_unless(error == -EINTR); return error; } SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 to; return do_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #ifdef CONFIG_COMPAT static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { long err; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); restore_saved_sigmask_unless(err == -EINTR); return err; } COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 to; return do_compat_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_compat_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #endif static int __init eventpoll_init(void) { struct sysinfo si; si_meminfo(&si); /* * Allows top 4% of lomem to be allocated for epoll watches (per user). */ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; BUG_ON(max_user_watches < 0); /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs */ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } fs_initcall(eventpoll_init);
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1084 1083 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/udp_tunnel.h> #include <net/vxlan.h> enum udp_tunnel_nic_table_entry_flags { UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), }; struct udp_tunnel_nic_table_entry { __be16 port; u8 type; u8 flags; u16 use_cnt; #define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX u8 hw_priv; }; /** * struct udp_tunnel_nic - UDP tunnel port offload state * @work: async work for talking to hardware from process context * @dev: netdev pointer * @need_sync: at least one port start changed * @need_replay: space was freed, we need a replay of all ports * @work_pending: @work is currently scheduled * @n_tables: number of tables under @entries * @missed: bitmap of tables which overflown * @entries: table of tables of ports currently offloaded */ struct udp_tunnel_nic { struct work_struct work; struct net_device *dev; u8 need_sync:1; u8 need_replay:1; u8 work_pending:1; unsigned int n_tables; unsigned long missed; struct udp_tunnel_nic_table_entry **entries; }; /* We ensure all work structs are done using driver state, but not the code. * We need a workqueue we can flush before module gets removed. */ static struct workqueue_struct *udp_tunnel_nic_workqueue; static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) { switch (type) { case UDP_TUNNEL_TYPE_VXLAN: return "vxlan"; case UDP_TUNNEL_TYPE_GENEVE: return "geneve"; case UDP_TUNNEL_TYPE_VXLAN_GPE: return "vxlan-gpe"; default: return "unknown"; } } static bool udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt == 0 && !entry->flags; } static bool udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); } static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) { if (!udp_tunnel_nic_entry_is_free(entry)) entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) { entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; } static bool udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | UDP_TUNNEL_NIC_ENTRY_DEL); } static void udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, struct udp_tunnel_nic_table_entry *entry, unsigned int flag) { entry->flags |= flag; utn->need_sync = 1; } static void udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, struct udp_tunnel_info *ti) { memset(ti, 0, sizeof(*ti)); ti->port = entry->port; ti->type = entry->type; ti->hw_priv = entry->hw_priv; } static bool udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return false; return true; } static bool udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; if (!utn->missed) return false; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!test_bit(i, &utn->missed)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return true; } return false; } static void __udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; entry = &utn->entries[table][idx]; if (entry->use_cnt) udp_tunnel_nic_ti_from_entry(entry, ti); } static void __udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; } static void udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, int err) { bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && (!err || (err == -EEXIST && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && (!err || (err == -ENOENT && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; if (!err) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; else entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; } static void udp_tunnel_nic_device_sync_one(struct net_device *dev, struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_info ti; int err; entry = &utn->entries[table][idx]; if (!udp_tunnel_nic_entry_is_queued(entry)) return; udp_tunnel_nic_ti_from_entry(entry, &ti); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); else err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, &ti); udp_tunnel_nic_entry_update_done(entry, err); if (err) netdev_warn(dev, "UDP tunnel port sync failed port %d type %s: %d\n", be16_to_cpu(entry->port), udp_tunnel_nic_tunnel_type_name(entry->type), err); } static void udp_tunnel_nic_device_sync_by_port(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_device_sync_one(dev, utn, i, j); } static void udp_tunnel_nic_device_sync_by_table(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; int err; for (i = 0; i < utn->n_tables; i++) { /* Find something that needs sync in this table */ for (j = 0; j < info->tables[i].n_entries; j++) if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) break; if (j == info->tables[i].n_entries) continue; err = info->sync_table(dev, i); if (err) netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", i, err); for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (udp_tunnel_nic_entry_is_queued(entry)) udp_tunnel_nic_entry_update_done(entry, err); } } } static void __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { if (!utn->need_sync) return; if (dev->udp_tunnel_nic_info->sync_table) udp_tunnel_nic_device_sync_by_table(dev, utn); else udp_tunnel_nic_device_sync_by_port(dev, utn); utn->need_sync = 0; /* Can't replay directly here, in case we come from the tunnel driver's * notification - trying to replay may deadlock inside tunnel driver. */ utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); } static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; bool may_sleep; if (!utn->need_sync) return; /* Drivers which sleep in the callback need to update from * the workqueue, if we come from the tunnel driver's notification. */ may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (!may_sleep) __udp_tunnel_nic_device_sync(dev, utn); if (may_sleep || utn->need_replay) { queue_work(udp_tunnel_nic_workqueue, &utn->work); utn->work_pending = 1; } } static bool udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, struct udp_tunnel_info *ti) { return table->tunnel_types & ti->type; } static bool udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i; /* Special case IPv4-only NICs */ if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && ti->sa_family != AF_INET) return false; for (i = 0; i < utn->n_tables; i++) if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) return true; return false; } static int udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_table_entry *entry; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry) && entry->port == ti->port && entry->type != ti->type) { __set_bit(i, &utn->missed); return true; } } return false; } static void udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; unsigned int from, to; WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX); /* If not going from used to unused or vice versa - all done. * For dodgy entries make sure we try to sync again (queue the entry). */ entry->use_cnt += use_cnt_adj; if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) return; /* Cancel the op before it was sent to the device, if possible, * otherwise we'd need to take special care to issue commands * in the same order the ports arrived. */ if (use_cnt_adj < 0) { from = UDP_TUNNEL_NIC_ENTRY_ADD; to = UDP_TUNNEL_NIC_ENTRY_DEL; } else { from = UDP_TUNNEL_NIC_ENTRY_DEL; to = UDP_TUNNEL_NIC_ENTRY_ADD; } if (entry->flags & from) { entry->flags &= ~from; if (!dodgy) return; } udp_tunnel_nic_entry_queue(utn, entry, to); } static bool udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; if (udp_tunnel_nic_entry_is_free(entry) || entry->port != ti->port || entry->type != ti->type) return false; if (udp_tunnel_nic_entry_is_frozen(entry)) return true; udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); return true; } /* Try to find existing matching entry and adjust its use count, instead of * adding a new one. Returns true if entry was found. In case of delete the * entry may have gotten removed in the process, in which case it will be * queued for removal. */ static bool udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti, int use_cnt_adj) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, use_cnt_adj)) return true; } return false; } static bool udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, +1); } static bool udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, -1); } static bool udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry)) continue; entry->port = ti->port; entry->type = ti->type; entry->use_cnt = 1; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); return true; } /* The different table may still fit this port in, but there * are no devices currently which have multiple tables accepting * the same tunnel type, and false positives are okay. */ __set_bit(i, &utn->missed); } return false; } static void __udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) return; if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && ti->port == htons(IANA_VXLAN_UDP_PORT)) { if (ti->type != UDP_TUNNEL_TYPE_VXLAN) netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); return; } if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; /* It may happen that a tunnel of one type is removed and different * tunnel type tries to reuse its port before the device was informed. * Rely on utn->missed to re-add this port later. */ if (udp_tunnel_nic_has_collision(dev, utn, ti)) return; if (!udp_tunnel_nic_add_existing(dev, utn, ti)) udp_tunnel_nic_add_new(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; udp_tunnel_nic_del_existing(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int i, j; ASSERT_RTNL(); utn = dev->udp_tunnel_nic; if (!utn) return; utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); /* We don't release rtnl across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); } __udp_tunnel_nic_device_sync(dev, utn); } static size_t __udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int j; size_t size; utn = dev->udp_tunnel_nic; if (!utn) return 0; size = 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; struct nlattr *nest; unsigned int j; utn = dev->udp_tunnel_nic; if (!utn) return 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, utn->entries[table][j].port) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(utn->entries[table][j].type))) goto err_cancel; nla_nest_end(skb, nest); } return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = __udp_tunnel_nic_dump_write, }; static void udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { int adj_cnt = -utn->entries[i][j].use_cnt; if (adj_cnt) udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); } __udp_tunnel_nic_device_sync(dev, utn); for (i = 0; i < utn->n_tables; i++) memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, sizeof(**utn->entries))); WARN_ON(utn->need_sync); utn->need_replay = 0; } static void udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node; unsigned int i, j; /* Freeze all the ports we are already tracking so that the replay * does not double up the refcount. */ for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); utn->missed = 0; utn->need_replay = 0; if (!info->shared) { udp_tunnel_get_rx_info(dev); } else { list_for_each_entry(node, &info->shared->devices, list) udp_tunnel_get_rx_info(node->dev); } for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); } static void udp_tunnel_nic_device_sync_work(struct work_struct *work) { struct udp_tunnel_nic *utn = container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); rtnl_unlock(); } static struct udp_tunnel_nic * udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, unsigned int n_tables) { struct udp_tunnel_nic *utn; unsigned int i; utn = kzalloc(sizeof(*utn), GFP_KERNEL); if (!utn) return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); utn->entries = kmalloc_array(n_tables, sizeof(void *), GFP_KERNEL); if (!utn->entries) goto err_free_utn; for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, sizeof(*utn->entries[i]), GFP_KERNEL); if (!utn->entries[i]) goto err_free_prev_entries; } return utn; err_free_prev_entries: while (i--) kfree(utn->entries[i]); kfree(utn->entries); err_free_utn: kfree(utn); return NULL; } static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn) { unsigned int i; for (i = 0; i < utn->n_tables; i++) kfree(utn->entries[i]); kfree(utn->entries); kfree(utn); } static int udp_tunnel_nic_register(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node = NULL; struct udp_tunnel_nic *utn; unsigned int n_tables, i; BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < UDP_TUNNEL_NIC_MAX_TABLES); /* Expect use count of at most 2 (IPv4, IPv6) per device */ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX < UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2); /* Check that the driver info is sane */ if (WARN_ON(!info->set_port != !info->unset_port) || WARN_ON(!info->set_port == !info->sync_table) || WARN_ON(!info->tables[0].n_entries)) return -EINVAL; if (WARN_ON(info->shared && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return -EINVAL; n_tables = 1; for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) continue; n_tables++; if (WARN_ON(!info->tables[i - 1].n_entries)) return -EINVAL; } /* Create UDP tunnel state structures */ if (info->shared) { node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->dev = dev; } if (info->shared && info->shared->udp_tunnel_nic_info) { utn = info->shared->udp_tunnel_nic_info; } else { utn = udp_tunnel_nic_alloc(info, n_tables); if (!utn) { kfree(node); return -ENOMEM; } } if (info->shared) { if (!info->shared->udp_tunnel_nic_info) { INIT_LIST_HEAD(&info->shared->devices); info->shared->udp_tunnel_nic_info = utn; } list_add_tail(&node->list, &info->shared->devices); } utn->dev = dev; dev_hold(dev); dev->udp_tunnel_nic = utn; if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) udp_tunnel_get_rx_info(dev); return 0; } static void udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; /* For a shared table remove this dev from the list of sharing devices * and if there are other devices just detach. */ if (info->shared) { struct udp_tunnel_nic_shared_node *node, *first; list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; if (list_entry_is_head(node, &info->shared->devices, list)) return; list_del(&node->list); kfree(node); first = list_first_entry_or_null(&info->shared->devices, typeof(*first), list); if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; goto release_dev; } info->shared->udp_tunnel_nic_info = NULL; } /* Flush before we check work, so we don't waste time adding entries * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. */ if (utn->work_pending) return; udp_tunnel_nic_free(utn); release_dev: dev->udp_tunnel_nic = NULL; dev_put(dev); } static int udp_tunnel_nic_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); const struct udp_tunnel_nic_info *info; struct udp_tunnel_nic *utn; info = dev->udp_tunnel_nic_info; if (!info) return NOTIFY_DONE; if (event == NETDEV_REGISTER) { int err; err = udp_tunnel_nic_register(dev); if (err) netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ utn = dev->udp_tunnel_nic; if (!utn) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { udp_tunnel_nic_unregister(dev, utn); return NOTIFY_OK; } /* All other events only matter if NIC has to be programmed open */ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return NOTIFY_DONE; if (event == NETDEV_UP) { WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { udp_tunnel_nic_flush(dev, utn); return NOTIFY_OK; } return NOTIFY_DONE; } static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { .notifier_call = udp_tunnel_nic_netdevice_event, }; static int __init udp_tunnel_nic_init_module(void) { int err; udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0); if (!udp_tunnel_nic_workqueue) return -ENOMEM; rtnl_lock(); udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; rtnl_unlock(); err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); if (err) goto err_unset_ops; return 0; err_unset_ops: rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); return err; } late_initcall(udp_tunnel_nic_init_module); static void __exit udp_tunnel_nic_cleanup_module(void) { unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); } module_exit(udp_tunnel_nic_cleanup_module); MODULE_LICENSE("GPL");
1072 200 200 189 18 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> #include <linux/tracehook.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/sysfs.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/mmu_context.h> /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. */ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, { } }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_TGID); detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) posix_cpu_timers_exit_group(tsk); #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exit_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); } } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void put_task_struct_rcu_user(struct task_struct *task) { if (refcount_dec_and_test(&task->rcu_users)) call_rcu(&task->rcu, delayed_put_task_struct); } void release_task(struct task_struct *p) { struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); thread_pid = get_pid(p->thread_pid); __exit_signal(p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } int rcuwait_wake_up(struct rcuwait *w) { int ret = 0; struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ task = rcu_dereference(w->task); if (task) ret = wake_up_process(task); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } #ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *c, *g, *p = current; retry: /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { WRITE_ONCE(mm->owner, NULL); return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(c, &p->children, sibling) { if (c->mm == mm) goto assign_new_owner; } /* * Search in the siblings */ list_for_each_entry(c, &p->real_parent->children, sibling) { if (c->mm == mm) goto assign_new_owner; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { if (c->mm == mm) goto assign_new_owner; if (c->mm) break; } } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); /* * Delay read_unlock() till we have the task_lock() * to ensure that c does not slip away underneath us */ read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); put_task_struct(c); goto retry; } WRITE_ONCE(mm->owner, c); task_unlock(c); put_task_struct(c); } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; struct core_state *core_state; exit_mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); /* * Serialize with any possible pending coredump. * We must hold mmap_lock around checking core_state * and clearing tsk->mm. The core-inducing thread * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ mmap_read_lock(mm); core_state = mm->core_state; if (core_state) { struct core_thread self; mmap_read_unlock(mm); self.task = current; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; freezable_schedule(); } __set_current_state(TASK_RUNNING); mmap_read_lock(mm); } mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); /* * When a thread stops operating on an address space, the loop * in membarrier_private_expedited() may not observe that * tsk->mm, and the loop in membarrier_global_expedited() may * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED * rq->membarrier_state, so those would not issue an IPI. * Membarrier requires a memory barrier after accessing * user-space memory, before clearing tsk->mm or the * rq->membarrier_state. */ smp_mb__after_spinlock(); local_irq_disable(); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); local_irq_enable(); task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { pid_ns->child_reaper = reaper; return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * This does two things: * * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else static inline void check_stack_usage(void) {} #endif void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; /* * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. */ WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); /* * If do_exit is called because this processes oopsed, it's possible * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before * continuing. Amongst other possible reasons, this is to prevent * mm_release()->clear_child_tid() from writing to a user-controlled * kernel address. */ force_uaccess_begin(); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } profile_task_exit(tsk); kcov_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); /* * We're taking recursive faults here in do_exit. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(); if (group_dead) acct_process(); trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. * * because of cgroup mode, must be called before cgroup_exit() */ perf_event_exit_task(tsk); sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); validate_creds_for_do_exit(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. */ unsigned int limit; /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); do_exit(signr); } void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); do_exit(code); } EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; BUG_ON(exit_code & 0x80); /* core dumps don't get here */ if (signal_group_exit(sig)) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } struct waitid_info { pid_t pid; uid_t uid; int status; int cause; }; struct wait_opts { enum pid_type wo_type; int wo_flags; struct pid *wo_pid; struct waitid_info *wo_info; int wo_stat; struct rusage *wo_rusage; wait_queue_entry_t child_wait; int notask_error; }; static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-theads * which can reap other children at the same time. Until * we change k_getrusage()-like users to rely on this lock * we have to take ->siglock as well. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&current->sighand->siglock); write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock(&psig->stats_lock); spin_unlock_irq(&current->sighand->siglock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. */ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (!eligible_pid(wo, p)) return 0; if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) return 0; return default_wake_function(wait, mode, sync, key); } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. */ static int do_wait_pid(struct wait_opts *wo) { bool ptrace; struct task_struct *target; int retval; ptrace = false; target = pid_task(wo->wo_pid, PIDTYPE_TGID); if (target && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } ptrace = true; target = pid_task(wo->wo_pid, PIDTYPE_PID); if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } return 0; } static long do_wait(struct wait_opts *wo) { int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); repeat: /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) goto end; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) goto end; retval = ptrace_do_wait(wo, tsk); if (retval) goto end; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) { retval = -ERESTARTSYS; if (!signal_pending(current)) { schedule(); goto repeat; } } end: __set_current_state(TASK_RUNNING); remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); return retval; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; unsigned int f_flags = 0; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid < 0) return -EINVAL; if (upid) pid = find_get_pid(upid); else pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options; wo.wo_info = infop; wo.wo_rusage = ru; if (f_flags & O_NONBLOCK) wo.wo_flags |= WNOHANG; ret = do_wait(&wo); if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) ret = -EAGAIN; put_pid(pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } int kernel_wait(pid_t pid, int *stat) { struct wait_opts wo = { .wo_type = PIDTYPE_PID, .wo_pid = find_get_pid(pid), .wo_flags = WEXITED, }; int ret; ret = do_wait(&wo); if (ret > 0 && wo.wo_stat) *stat = wo.wo_stat; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } #endif /** * thread_group_exited - check that a thread group has exited * @pid: tgid of thread group to be checked. * * Test if the thread group represented by tgid has exited (all * threads are zombies, dead or completely gone). * * Return: true if the thread group has exited. false otherwise. */ bool thread_group_exited(struct pid *pid) { struct task_struct *task; bool exited; rcu_read_lock(); task = pid_task(pid, PIDTYPE_PID); exited = !task || (READ_ONCE(task->exit_state) && thread_group_empty(task)); rcu_read_unlock(); return exited; } EXPORT_SYMBOL(thread_group_exited); __weak void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort);
244 244 244 244 241 241 130 241 221 241 7 241 241 241 221 241 241 241 241 241 241 221 241 241 241 241 248 248 8 241 241 241 241 248 248 248 8 241 8 9 9 9 9 233 213 24 235 234 3 248 187 86 244 246 60 247 112 254 254 254 254 254 254 3 251 254 115 248 8 240 3 251 251 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/secretmem.h> #include <linux/sched/signal.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> #include <linux/migrate.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include "internal.h" struct follow_page_context { struct dev_pagemap *pgmap; unsigned int page_mask; }; static void hpage_pincount_add(struct page *page, int refs) { VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); VM_BUG_ON_PAGE(page != compound_head(page), page); atomic_add(refs, compound_pincount_ptr(page)); } static void hpage_pincount_sub(struct page *page, int refs) { VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); VM_BUG_ON_PAGE(page != compound_head(page), page); atomic_sub(refs, compound_pincount_ptr(page)); } /* Equivalent to calling put_page() @refs times. */ static void put_page_refs(struct page *page, int refs) { #ifdef CONFIG_DEBUG_VM if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page)) return; #endif /* * Calling put_page() for each ref is unnecessarily slow. Only the last * ref needs a put_page(). */ if (refs > 1) page_ref_sub(page, refs - 1); put_page(page); } /* * Return the compound head page with ref appropriately incremented, * or NULL if that failed. */ static inline struct page *try_get_compound_head(struct page *page, int refs) { struct page *head = compound_head(page); if (WARN_ON_ONCE(page_ref_count(head) < 0)) return NULL; if (unlikely(!page_cache_add_speculative(head, refs))) return NULL; /* * At this point we have a stable reference to the head page; but it * could be that between the compound_head() lookup and the refcount * increment, the compound page was split, in which case we'd end up * holding a reference on a page that has nothing to do with the page * we were given anymore. * So now that the head page is stable, recheck that the pages still * belong together. */ if (unlikely(compound_head(page) != head)) { put_page_refs(head, refs); return NULL; } return head; } /** * try_grab_compound_head() - attempt to elevate a page's refcount, by a * flags-dependent amount. * * Even though the name includes "compound_head", this function is still * appropriate for callers that have a non-compound @page to get. * * @page: pointer to page to be grabbed * @refs: the value to (effectively) add to the page's refcount * @flags: gup flags: these are the FOLL_* flag values. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * * FOLL_GET: page's refcount will be incremented by @refs. * * FOLL_PIN on compound pages that are > two pages long: page's refcount will * be incremented by @refs, and page[2].hpage_pinned_refcount will be * incremented by @refs * GUP_PIN_COUNTING_BIAS. * * FOLL_PIN on normal pages, or compound pages that are two pages long: * page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS. * * Return: head page (with refcount appropriately incremented) for success, or * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted. */ __maybe_unused struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); else if (flags & FOLL_PIN) { /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow * path. */ if (unlikely((flags & FOLL_LONGTERM) && !is_pinnable_page(page))) return NULL; /* * CAUTION: Don't use compound_head() on the page before this * point, the result won't be stable. */ page = try_get_compound_head(page, refs); if (!page) return NULL; /* * When pinning a compound page of order > 1 (which is what * hpage_pincount_available() checks for), use an exact count to * track it, via hpage_pincount_add/_sub(). * * However, be sure to *also* increment the normal page refcount * field at least once, so that the page really is pinned. * That's why the refcount from the earlier * try_get_compound_head() is left intact. */ if (hpage_pincount_available(page)) hpage_pincount_add(page, refs); else page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, refs); return page; } WARN_ON_ONCE(1); return NULL; } static void put_compound_head(struct page *page, int refs, unsigned int flags) { if (flags & FOLL_PIN) { mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, refs); if (hpage_pincount_available(page)) hpage_pincount_sub(page, refs); else refs *= GUP_PIN_COUNTING_BIAS; } put_page_refs(page, refs); } /** * try_grab_page() - elevate a page's refcount by a flag-dependent amount * * This might not do anything at all, depending on the flags argument. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. * * @page: pointer to page to be grabbed * @flags: gup flags: these are the FOLL_* flag values. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same * time. Cases: please see the try_grab_compound_head() documentation, with * "refs=1". * * Return: true for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or * FOLL_PIN was set, but the page could not be grabbed. */ bool __must_check try_grab_page(struct page *page, unsigned int flags) { WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); if (flags & FOLL_GET) return try_get_page(page); else if (flags & FOLL_PIN) { int refs = 1; page = compound_head(page); if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; if (hpage_pincount_available(page)) hpage_pincount_add(page, 1); else refs = GUP_PIN_COUNTING_BIAS; /* * Similar to try_grab_compound_head(): even if using the * hpage_pincount_add/_sub() routines, be sure to * *also* increment the normal page refcount field at least * once, so that the page really is pinned. */ page_ref_add(page, refs); mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1); } return true; } /** * unpin_user_page() - release a dma-pinned page * @page: pointer to page to be released * * Pages that were pinned via pin_user_pages*() must be released via either * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so * that such pages can be separately tracked and uniquely handled. In * particular, interactions with RDMA and filesystems need special handling. */ void unpin_user_page(struct page *page) { put_compound_head(compound_head(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page); static inline void compound_range_next(unsigned long i, unsigned long npages, struct page **list, struct page **head, unsigned int *ntails) { struct page *next, *page; unsigned int nr = 1; if (i >= npages) return; next = *list + i; page = compound_head(next); if (PageCompound(page) && compound_order(page) >= 1) nr = min_t(unsigned int, page + compound_nr(page) - next, npages - i); *head = page; *ntails = nr; } #define for_each_compound_range(__i, __list, __npages, __head, __ntails) \ for (__i = 0, \ compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \ __i < __npages; __i += __ntails, \ compound_range_next(__i, __npages, __list, &(__head), &(__ntails))) static inline void compound_next(unsigned long i, unsigned long npages, struct page **list, struct page **head, unsigned int *ntails) { struct page *page; unsigned int nr; if (i >= npages) return; page = compound_head(list[i]); for (nr = i + 1; nr < npages; nr++) { if (compound_head(list[nr]) != page) break; } *head = page; *ntails = nr - i; } #define for_each_compound_head(__i, __list, __npages, __head, __ntails) \ for (__i = 0, \ compound_next(__i, __npages, __list, &(__head), &(__ntails)); \ __i < __npages; __i += __ntails, \ compound_next(__i, __npages, __list, &(__head), &(__ntails))) /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page" refers to a page that has had one of the get_user_pages() * variants called on that page. * * For each page in the @pages array, make that page (or its head page, if a * compound page) dirty, if @make_dirty is true, and if the page was previously * listed as clean. In any case, releases all pages using unpin_user_page(), * possibly via unpin_user_pages(), for the non-dirty case. * * Please see the unpin_user_page() documentation for details. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long index; struct page *head; unsigned int ntails; if (!make_dirty) { unpin_user_pages(pages, npages); return; } for_each_compound_head(index, pages, npages, head, ntails) { /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key * cases: * * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called * page_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes * on to call TestClearPageDirty(), and write the page * back. * * 2) This code sees the page as clean, so it calls * set_page_dirty(). The page stays dirty, despite being * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ if (!PageDirty(head)) set_page_dirty_lock(head); put_compound_head(head, ntails, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages_dirty_lock); /** * unpin_user_page_range_dirty_lock() - release and optionally dirty * gup-pinned page range * * @page: the starting page of a range maybe marked dirty, and definitely released. * @npages: number of consecutive pages to release. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page range" refers to a range of pages that has had one of the * pin_user_pages() variants called on that page. * * For the page ranges defined by [page .. page+npages], make that range (or * its head pages, if a compound page) dirty, if @make_dirty is true, and if the * page range was previously listed as clean. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { unsigned long index; struct page *head; unsigned int ntails; for_each_compound_range(index, &page, npages, head, ntails) { if (make_dirty && !PageDirty(head)) set_page_dirty_lock(head); put_compound_head(head, ntails, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long index; struct page *head; unsigned int ntails; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by * leaving them pinned), but probably not. More likely, gup/pup returned * a hard -ERRNO error to the caller, who erroneously passed it here. */ if (WARN_ON(IS_ERR_VALUE(npages))) return; for_each_compound_head(index, pages, npages, head, ntails) put_compound_head(head, ntails, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_pages); /* * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) { if (!test_bit(MMF_HAS_PINNED, mm_flags)) set_bit(MMF_HAS_PINNED, mm_flags); } #ifdef CONFIG_MMU static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { /* * When core dumping an enormous anonymous area that nobody * has touched so far, we don't want to allocate unnecessary pages or * page tables. Return error instead of NULL to skip handle_mm_fault, * then get_dump_page() will return NULL to leave a hole in the dump. * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */ if ((flags & FOLL_DUMP) && (vma_is_anonymous(vma) || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); return NULL; } static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { /* No page to get reference */ if (flags & (FOLL_GET | FOLL_PIN)) return -EFAULT; if (flags & FOLL_TOUCH) { pte_t entry = *pte; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(*pte, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } } /* Proper page table entry exists, but no corresponding struct page */ return -EEXIST; } /* * FOLL_FORCE can write to even unwritable pte's, but only * after we've gone through a COW cycle and they are dirty. */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { return pte_write(pte) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); } static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; int ret; /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); /* * Considering PTE level hugetlb, like continuous-PTE hugetlb on * ARM64 architecture. */ if (is_vm_hugetlb_page(vma)) { page = follow_huge_pmd_pte(vma, address, flags); if (page) return page; return no_page_table(vma, flags); } retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if (!pte_present(pte)) { swp_entry_t entry; /* * KSM's break_ksm() relies upon recognizing a ksm page * even while it is being migrated, so for that case we * need migration_entry_wait(). */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; if (pte_none(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) goto no_page; pte_unmap_unlock(ptep, ptl); migration_entry_wait(mm, pmd, address); goto retry; } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } page = vm_normal_page(vma, address, pte); if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN * case since they are only valid while holding the pgmap * reference. */ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); if (*pgmap) page = pte_page(pte); else goto no_page; } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ if (unlikely(!try_grab_page(page, flags))) { page = ERR_PTR(-ENOMEM); goto out; } /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* Do not mlock pte-mapped THP */ if (PageTransCompound(page)) goto out; /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE * which might bounce very badly if there is contention. * * If the page is already locked, we don't need to * handle it now - vmscan will handle it later if and * when it attempts to reclaim the page. */ if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* * Because we lock page here, and migration is * blocked by the pte's page reference, and we * know the page is still mapped, we don't even * need to check for file-cache page truncation. */ mlock_vma_page(page); unlock_page(page); } } out: pte_unmap_unlock(ptep, ptl); return page; no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; return no_page_table(vma, flags); } static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, struct follow_page_context *ctx) { pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); /* * The READ_ONCE() will stabilize the pmdval in a register or * on the stack so that it will stop changing under the code. */ pmdval = READ_ONCE(*pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { page = follow_huge_pmd_pte(vma, address, flags); if (page) return page; return no_page_table(vma, flags); } if (is_hugepd(__hugepd(pmd_val(pmdval)))) { page = follow_huge_pd(vma, address, __hugepd(pmd_val(pmdval)), flags, PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); } retry: if (!pmd_present(pmdval)) { if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(pmdval)); if (is_pmd_migration_entry(pmdval)) pmd_migration_entry_wait(mm, pmd); pmdval = READ_ONCE(*pmd); /* * MADV_DONTNEED may convert the pmd to null because * mmap_lock is held in read mode */ if (pmd_none(pmdval)) return no_page_table(vma, flags); goto retry; } if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; } if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); retry_locked: ptl = pmd_lock(mm, pmd); if (unlikely(pmd_none(*pmd))) { spin_unlock(ptl); return no_page_table(vma, flags); } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); pmd_migration_entry_wait(mm, pmd); goto retry_locked; } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (flags & FOLL_SPLIT_PMD) { int ret; page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); ret = 0; split_huge_pmd(vma, pmd, address); if (pmd_trans_unstable(pmd)) ret = -EBUSY; } else { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; } return ret ? ERR_PTR(ret) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); ctx->page_mask = HPAGE_PMD_NR - 1; return page; } static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, unsigned int flags, struct follow_page_context *ctx) { pud_t *pud; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pud = pud_offset(p4dp, address); if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) { page = follow_huge_pud(mm, address, pud, flags); if (page) return page; return no_page_table(vma, flags); } if (is_hugepd(__hugepd(pud_val(*pud)))) { page = follow_huge_pd(vma, address, __hugepd(pud_val(*pud)), flags, PUD_SHIFT); if (page) return page; return no_page_table(vma, flags); } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); return follow_pmd_mask(vma, address, pud, flags, ctx); } static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, unsigned int flags, struct follow_page_context *ctx) { p4d_t *p4d; struct page *page; p4d = p4d_offset(pgdp, address); if (p4d_none(*p4d)) return no_page_table(vma, flags); BUILD_BUG_ON(p4d_huge(*p4d)); if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); if (is_hugepd(__hugepd(p4d_val(*p4d)))) { page = follow_huge_pd(vma, address, __hugepd(p4d_val(*p4d)), flags, P4D_SHIFT); if (page) return page; return no_page_table(vma, flags); } return follow_pud_mask(vma, address, p4d, flags, ctx); } /** * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a * pointer to output page_mask * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches * the device's dev_pagemap metadata to avoid repeating expensive lookups. * * On output, the @ctx->page_mask is set according to the size of the page. * * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { pgd_t *pgd; struct page *page; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; /* make this handle hugepd */ page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN)); return page; } pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); if (pgd_huge(*pgd)) { page = follow_huge_pgd(mm, address, pgd, flags); if (page) return page; return no_page_table(vma, flags); } if (is_hugepd(__hugepd(pgd_val(*pgd)))) { page = follow_huge_pd(vma, address, __hugepd(pgd_val(*pgd)), flags, PGDIR_SHIFT); if (page) return page; return no_page_table(vma, flags); } return follow_p4d_mask(vma, address, pgd, flags, ctx); } struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { struct follow_page_context ctx = { NULL }; struct page *page; if (vma_is_secretmem(vma)) return NULL; page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; int ret = -EFAULT; /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; if (address > TASK_SIZE) pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return -EFAULT; pud = pud_offset(p4d, address); if (pud_none(*pud)) return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); if (pte_none(*pte)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; *page = vm_normal_page(*vma, address, *pte); if (!*page) { if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); } if (unlikely(!try_grab_page(*page, gup_flags))) { ret = -ENOMEM; goto unmap; } out: ret = 0; unmap: pte_unmap(pte); return ret; } /* * mmap_lock must be held on entry. If @locked != NULL and *@flags * does not include FOLL_NOWAIT, the mmap_lock may be released. If it * is, *@locked will be set to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) return -ENOENT; if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; if (locked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist */ fault_flags |= FAULT_FLAG_TRIED; } ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; } /* * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when * necessary, even if maybe_mkwrite decided not to set pte_write. We * can thus safely do subsequent page lookups as if they were reads. * But only do so when looping for pte_write is futile: in some cases * userspace may also be wanting to write to the gotten user page, * which a read fault here might prevent (a readonly page might get * reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) *flags |= FOLL_COW; return 0; } static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; int write = (gup_flags & FOLL_WRITE); int foreign = (gup_flags & FOLL_REMOTE); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) return -EFAULT; if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; if (vma_is_secretmem(vma)) return -EFAULT; if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could * set a breakpoint in a read-only mapping of an * executable, without corrupting the file (yet only * when that file had been opened for writing!). * Anon pages in shared mappings are surprising: now * just reject it. */ if (!is_cow_mapping(vm_flags)) return -EFAULT; } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * Is there actually any vma we can reach here which does not * have VM_MAYREAD set? */ if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } /* * gups are always data accesses, not instruction * fetches, so execute=false here */ if (!arch_vma_access_permitted(vma, write, false, foreign)) return -EFAULT; return 0; } /** * __get_user_pages() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * * @vmas are valid only as long as mmap_lock is held. * * Must be called with mmap_lock held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If * the page is written to, set_page_dirty (or set_page_dirty_lock, as * appropriate) must be called after the page is finished with, and * before put_page is called. * * If @locked != NULL, *@locked will be set to 0 when mmap_lock is * released by an up_read(). That can happen if @gup_flags does not * have FOLL_NOWAIT. * * A caller using such a combination of @locked and @gup_flags * must therefore hold the mmap_lock for reading only, and recognize * when it's been released. Otherwise, it must be held for either * reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; start = untagged_addr(start); VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); /* * If FOLL_FORCE is set then do not force a full fault as the hinting * fault information is unrelated to the reference behaviour of a task * using the address space */ if (!(gup_flags & FOLL_FORCE)) gup_flags |= FOLL_NUMA; do { struct page *page; unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { vma = find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &pages[i] : NULL); if (ret) goto out; ctx.page_mask = 0; goto next_page; } if (!vma) { ret = -EFAULT; goto out; } ret = check_vma_flags(vma, gup_flags); if (ret) goto out; if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, gup_flags, locked); if (locked && *locked == 0) { /* * We've got a VM_FAULT_RETRY * and we've lost mmap_lock. * We must stop here. */ BUG_ON(gup_flags & FOLL_NOWAIT); goto out; } continue; } } retry: /* * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page) { ret = faultin_page(vma, start, &foll_flags, locked); switch (ret) { case 0: goto retry; case -EBUSY: ret = 0; fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: goto out; case -ENOENT: goto next_page; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding * struct page. */ goto next_page; } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); ctx.page_mask = 0; } next_page: if (vmas) { vmas[i] = vma; ctx.page_mask = 0; } page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); out: if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return i ? i : ret; } static bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; if (!(vm_flags & vma->vm_flags)) return false; /* * The architecture might have a hardware protection * mechanism other than read/write that can deny access. * * gup always represents data access, not instruction * fetches, so execute=false here: */ if (!arch_vma_access_permitted(vma, write, false, foreign)) return false; return true; } /** * fixup_user_fault() - manually resolve a user page fault * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller * does not allow retry. If NULL, the caller must guarantee * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() * section), this returns -EFAULT, and we want to resolve the user fault before * trying again. * * Typically this is meant to be used by the futex code. * * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * * This function will not return with an unlocked mmap_lock. So it has not the * same semantics wrt the @mm->mmap_lock as does filemap_fault(). */ int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { struct vm_area_struct *vma; vm_fault_t ret; address = untagged_addr(address); if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current)) return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; } return 0; } EXPORT_SYMBOL_GPL(fixup_user_fault); /* * Please note that this function, unlike __get_user_pages will not * return 0 for nr_pages > 0 without FOLL_NOWAIT */ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, int *locked, unsigned int flags) { long ret, pages_done; bool lock_dropped; if (locked) { /* if VM_FAULT_RETRY can be returned, vmas become invalid */ BUG_ON(vmas); /* check caller initialized locked */ BUG_ON(*locked != 1); } if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has * carelessly failed to specify FOLL_GET), so keep doing that, but only * for FOLL_GET, not for the newer FOLL_PIN. * * FOLL_PIN always expects pages to be non-null, but no need to assert * that here, as any failures will be obvious enough. */ if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET; pages_done = 0; lock_dropped = false; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, vmas, locked); if (!locked) /* VM_FAULT_RETRY couldn't trigger, bypass */ return ret; /* VM_FAULT_RETRY cannot return errors */ if (!*locked) { BUG_ON(ret < 0); BUG_ON(ret >= nr_pages); } if (ret > 0) { nr_pages -= ret; pages_done += ret; if (!nr_pages) break; } if (*locked) { /* * VM_FAULT_RETRY didn't trigger or it was a * FOLL_NOWAIT. */ if (!pages_done) pages_done = ret; break; } /* * VM_FAULT_RETRY triggered, so seek to the faulting offset. * For the prefault case (!pages) we only update counts. */ if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; lock_dropped = true; retry: /* * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted * by fatal signals, so we need to check it before we * start trying again otherwise it can loop forever. */ if (fatal_signal_pending(current)) { if (!pages_done) pages_done = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) { BUG_ON(ret > 0); if (!pages_done) pages_done = ret; break; } *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, pages, NULL, locked); if (!*locked) { /* Continue to retry until we succeeded */ BUG_ON(ret != 0); goto retry; } if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) pages_done = ret; break; } nr_pages--; pages_done++; if (!nr_pages) break; if (likely(pages)) pages++; start += PAGE_SIZE; } if (lock_dropped && *locked) { /* * We must let the caller know we temporarily dropped the lock * and so the critical section protected by it was lost. */ mmap_read_unlock(mm); *locked = 0; } return pages_done; } /** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address * @locked: whether the mmap_lock is still held * * This takes care of mlocking the pages too if VM_LOCKED is set. * * Return either number of pages pinned in the vma, or a negative error * code on error. * * vma->vm_mm->mmap_lock must be held. * * If @locked is NULL, it may be held for read or write and will * be unperturbed. * * If @locked is non-NULL, it must held for read only and may be * released. If it's released, *@locked will be set to 0. */ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; if (vma->vm_flags & VM_LOCKONFAULT) gup_flags &= ~FOLL_POPULATE; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW * and we would not want to dirty them for nothing. */ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; /* * We want mlock to succeed for regions that have any permissions * other than PROT_NONE. */ if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ return __get_user_pages(mm, start, nr_pages, gup_flags, NULL, NULL, locked); } /* * faultin_vma_page_range() - populate (prefault) page tables inside the * given VMA range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * * @vma: target vma * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * * Returns either number of processed pages in the vma, or a negative error * code on error (see __get_user_pages()). * * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and * covered by the VMA. * * If @locked is NULL, it may be held for read or write and will be unperturbed. * * If @locked is non-NULL, it must held for read only and may be released. If * it's released, *@locked will be set to 0. */ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* * FOLL_TOUCH: Mark page accessed and thereby young; will also mark * the page dirty with FOLL_WRITE -- which doesn't make a * difference with !FOLL_FORCE, because the page is writable * in the page table. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit * a poisoned page. * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT. * !FOLL_FORCE: Require proper access permissions. */ gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON; if (write) gup_flags |= FOLL_WRITE; /* * We want to report -EINVAL instead of -EFAULT for any permission * problems or incompatible mappings. */ if (check_vma_flags(vma, gup_flags)) return -EINVAL; return __get_user_pages(mm, start, nr_pages, gup_flags, NULL, NULL, locked); } /* * __mm_populate - populate and/or mlock pages within a range of address space. * * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap * flags. VMAs must be already marked with the desired vm_flags, and * mmap_lock must not be held. */ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; end = start + len; for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ if (!locked) { locked = 1; mmap_read_lock(mm); vma = find_vma(mm, nstart); } else if (nstart >= vma->vm_end) vma = vma->vm_next; if (!vma || vma->vm_start >= end) break; /* * Set [nstart; nend) to intersection of desired address * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; /* * Now fault in a range of pages. populate_vma_page_range() * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ } break; } nend = nstart + ret * PAGE_SIZE; ret = 0; } if (locked) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ } #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, int *locked, unsigned int foll_flags) { struct vm_area_struct *vma; unsigned long vm_flags; long i; /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. */ vm_flags = (foll_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (foll_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) goto finish_or_fault; /* protect what we can, including chardevs */ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) goto finish_or_fault; if (pages) { pages[i] = virt_to_page(start); if (pages[i]) get_page(pages[i]); } if (vmas) vmas[i] = vma; start = (start + PAGE_SIZE) & PAGE_MASK; } return i; finish_or_fault: return i ? : -EFAULT; } #endif /* !CONFIG_MMU */ /** * fault_in_writeable - fault in userspace address range for writing * @uaddr: start of address range * @size: size of address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_writeable(char __user *uaddr, size_t size) { char __user *start = uaddr, *end; if (unlikely(size == 0)) return 0; if (!PAGE_ALIGNED(uaddr)) { if (unlikely(__put_user(0, uaddr) != 0)) return size; uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { if (unlikely(__put_user(0, uaddr) != 0)) goto out; uaddr += PAGE_SIZE; } out: if (size > uaddr - start) return size - (uaddr - start); return 0; } EXPORT_SYMBOL(fault_in_writeable); /* * fault_in_safe_writeable - fault in an address range for writing * @uaddr: start of address range * @size: length of address range * * Faults in an address range for writing. This is primarily useful when we * already know that some or all of the pages in the address range aren't in * memory. * * Unlike fault_in_writeable(), this function is non-destructive. * * Note that we don't pin or otherwise hold the pages referenced that we fault * in. There's no guarantee that they'll stay in memory for any duration of * time. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { unsigned long start = (unsigned long)uaddr, end; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; end = PAGE_ALIGN(start + size); if (end < start) end = 0; mmap_read_lock(mm); do { if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) break; start = (start + PAGE_SIZE) & PAGE_MASK; } while (start != end); mmap_read_unlock(mm); if (size > (unsigned long)uaddr - start) return size - ((unsigned long)uaddr - start); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); /** * fault_in_readable - fault in userspace address range for reading * @uaddr: start of user address range * @size: size of user address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_readable(const char __user *uaddr, size_t size) { const char __user *start = uaddr, *end; volatile char c; if (unlikely(size == 0)) return 0; if (!PAGE_ALIGNED(uaddr)) { if (unlikely(__get_user(c, uaddr) != 0)) return size; uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { if (unlikely(__get_user(c, uaddr) != 0)) goto out; uaddr += PAGE_SIZE; } out: (void)c; if (size > uaddr - start) return size - (uaddr - start); return 0; } EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). * * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - * allowing a hole to be left in the corefile to save disk space. * * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { struct mm_struct *mm = current->mm; struct page *page; int locked = 1; int ret; if (mmap_read_lock_killable(mm)) return NULL; ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); if (locked) mmap_read_unlock(mm); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION /* * Check whether all pages are pinnable, if so return number of pages. If some * pages are not pinnable, migrate them, and unpin all pages. Return zero if * pages were migrated, or if some pages were not successfully isolated. * Return negative error if migration fails. */ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { unsigned long i; unsigned long isolation_error_count = 0; bool drain_allow = true; LIST_HEAD(movable_page_list); long ret = 0; struct page *prev_head = NULL; struct page *head; struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; for (i = 0; i < nr_pages; i++) { head = compound_head(pages[i]); if (head == prev_head) continue; prev_head = head; /* * If we get a movable page, since we are going to be pinning * these entries, try to move them out if possible. */ if (!is_pinnable_page(head)) { if (PageHuge(head)) { if (isolate_hugetlb(head, &movable_page_list)) isolation_error_count++; } else { if (!PageLRU(head) && drain_allow) { lru_add_drain_all(); drain_allow = false; } if (isolate_lru_page(head)) { isolation_error_count++; continue; } list_add_tail(&head->lru, &movable_page_list); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_lru(head), thp_nr_pages(head)); } } } /* * If list is empty, and no isolation errors, means that all pages are * in the correct zone. */ if (list_empty(&movable_page_list) && !isolation_error_count) return nr_pages; if (gup_flags & FOLL_PIN) { unpin_user_pages(pages, nr_pages); } else { for (i = 0; i < nr_pages; i++) put_page(pages[i]); } if (!list_empty(&movable_page_list)) { ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN, NULL); if (ret && !list_empty(&movable_page_list)) putback_movable_pages(&movable_page_list); } return ret > 0 ? -ENOMEM : ret; } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { return nr_pages; } #endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which * allows us to process the FOLL_LONGTERM flag. */ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, unsigned int gup_flags) { unsigned int flags; long rc; if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); flags = memalloc_pin_save(); do { rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); if (rc <= 0) break; rc = check_and_migrate_movable_pages(rc, pages, gup_flags); } while (!rc); memalloc_pin_restore(flags); return rc; } static bool is_valid_gup_flags(unsigned int gup_flags) { /* * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, * never directly by the caller, so enforce that with an assertion: */ if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return false; /* * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying * that is, FOLL_LONGTERM is a specific case, more restrictive case of * FOLL_PIN. */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return false; return true; } #ifdef CONFIG_MMU static long __get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { /* * Parts of FOLL_LONGTERM behavior are incompatible with * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on * vmas. However, this only comes up if locked is set, and there are * callers that do request FOLL_LONGTERM, but do not set locked. So, * allow what we can. */ if (gup_flags & FOLL_LONGTERM) { if (WARN_ON_ONCE(locked)) return -EINVAL; /* * This will check the vmas (even if our vmas arg is NULL) * and return -ENOTSUPP if DAX isn't allowed in this case: */ return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } /** * get_user_pages_remote() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * * The caller is responsible for releasing returned @pages, via put_page(). * * @vmas are valid only as long as mmap_lock is held. * * Must be called with mmap_lock held for read or write. * * get_user_pages_remote walks a process's page tables and takes a reference * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * * get_user_pages_remote is typically used for fewer-copy IO operations, * to get a handle on the memory by some means other than accesses * via the user virtual addresses. The pages may be submitted for * DMA to devices or accessed via their kernel linear mapping (via the * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { if (!is_valid_gup_flags(gup_flags)) return -EINVAL; return __get_user_pages_remote(mm, start, nr_pages, gup_flags, pages, vmas, locked); } EXPORT_SYMBOL(get_user_pages_remote); #else /* CONFIG_MMU */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { return 0; } static long __get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { return 0; } #endif /* !CONFIG_MMU */ /** * get_user_pages() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * * This is the same as get_user_pages_remote(), just with a less-flexible * calling convention where we assume that the mm being operated on belongs to * the current task, and doesn't allow passing of a locked parameter. We also * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { if (!is_valid_gup_flags(gup_flags)) return -EINVAL; return __gup_longterm_locked(current->mm, start, nr_pages, pages, vmas, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); /** * get_user_pages_locked() - variant of get_user_pages() * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * It is suitable to replace the form: * * mmap_read_lock(mm); * do_something() * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * to: * * int locked = 1; * mmap_read_lock(mm); * do_something() * get_user_pages_locked(mm, ..., pages, &locked); * if (locked) * mmap_read_unlock(mm); * * We can leverage the VM_FAULT_RETRY functionality in the page fault * paths better by using either get_user_pages_locked() or * get_user_pages_unlocked(). * */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { /* * FIXME: Current FOLL_LONGTERM behavior is incompatible with * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on * vmas. As there are no users of this flag in this call we simply * disallow this option for now. */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; /* * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, * never directly by the caller, so enforce that: */ if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_locked); /* * get_user_pages_unlocked() is suitable to replace the form: * * mmap_read_lock(mm); * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * with: * * get_user_pages_unlocked(mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so * get_user_pages_fast should be used instead if specific gup_flags * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { struct mm_struct *mm = current->mm; int locked = 1; long ret; /* * FIXME: Current FOLL_LONGTERM behavior is incompatible with * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on * vmas. As there are no users of this flag in this call we simply * disallow this option for now. */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; mmap_read_lock(mm); ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); if (locked) mmap_read_unlock(mm); return ret; } EXPORT_SYMBOL(get_user_pages_unlocked); /* * Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be * protected from page table pages being freed from under it, and should * block any THP splits. * * One way to achieve this is to have the walker disable interrupts, and * rely on IPIs from the TLB flushing code blocking before the page table * pages are freed. This is unsuitable for architectures that do not need * to broadcast an IPI when invalidating TLBs. * * Another way to achieve this is to batch up page table containing pages * belonging to more than one mm_user, then rcu_sched a callback to free those * pages. Disabling interrupts will allow the fast_gup walker to both block * the rcu_sched callback, and an IPI that we broadcast for splitting THPs * (which is a relatively rare event). The code below adopts this strategy. * * Before activating this code, please be aware that the following assumptions * are currently made: * * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * * *) access_ok is sufficient to validate userspace address ranges. * * The last two assumptions can be relaxed by the addition of helper functions. * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ #ifdef CONFIG_HAVE_FAST_GUP static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; ClearPageReferenced(page); if (flags & FOLL_PIN) unpin_user_page(page); else put_page(page); } } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * Fast-gup relies on pte change detection to avoid concurrent pgtable * operations. * * To pin the page, fast-gup needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy * with fast-gup, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because fast-gup may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); do { pte_t pte = ptep_get_lockless(ptep); struct page *head, *page; /* * Similar to the PMD case below, NUMA hinting must take slow * path using the pte_protnone check. */ if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { if (unlikely(flags & FOLL_LONGTERM)) goto pte_unmap; pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); head = try_grab_compound_head(page, 1, flags); if (!head) goto pte_unmap; if (unlikely(page_is_secretmem(page))) { put_compound_head(head, 1, flags); goto pte_unmap; } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(*ptep))) { put_compound_head(head, 1, flags); goto pte_unmap; } VM_BUG_ON_PAGE(compound_head(page) != head, page); /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please * see Documentation/core-api/pin_user_pages.rst for * details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { unpin_user_page(page); goto pte_unmap; } } SetPageReferenced(page); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; pte_unmap: if (pgmap) put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; int ret = 1; do { struct page *page = pfn_to_page(pfn); pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); ret = 0; break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); ret = 0; break; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); return ret; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } #endif static int record_subpages(struct page *page, unsigned long addr, unsigned long end, struct page **pages) { int nr; for (nr = 0; addr != end; addr += PAGE_SIZE) pages[nr++] = page++; return nr; } #ifdef CONFIG_ARCH_HAS_HUGEPD static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { unsigned long __boundary = (addr + sz) & ~(sz-1); return (__boundary - 1 < end - 1) ? __boundary : end; } static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long pte_end; struct page *head, *page; pte_t pte; int refs; pte_end = (addr + sz) & ~(sz-1); if (pte_end < end) end = pte_end; pte = huge_ptep_get(ptep); if (!pte_access_permitted(pte, flags & FOLL_WRITE)) return 0; /* hugepages are never "special" */ VM_BUG_ON(!pfn_valid(pte_pfn(pte))); head = pte_page(pte); page = head + ((addr & (sz-1)) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); head = try_grab_compound_head(head, refs, flags); if (!head) return 0; if (unlikely(pte_val(pte) != pte_val(*ptep))) { put_compound_head(head, refs, flags); return 0; } *nr += refs; SetPageReferenced(head); return 1; } static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { pte_t *ptep; unsigned long sz = 1UL << hugepd_shift(hugepd); unsigned long next; ptep = hugepte_offset(hugepd, addr, pdshift); do { next = hugepte_addr_end(addr, end, sz); if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) return 0; } while (ptep++, addr = next, addr != end); return 1; } #else static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_HUGEPD */ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, pages, nr); } page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); head = try_grab_compound_head(pmd_page(orig), refs, flags); if (!head) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { put_compound_head(head, refs, flags); return 0; } *nr += refs; SetPageReferenced(head); return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pud(orig, pudp, addr, end, flags, pages, nr); } page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); head = try_grab_compound_head(pud_page(orig), refs, flags); if (!head) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { put_compound_head(head, refs, flags); return 0; } *nr += refs; SetPageReferenced(head); return 1; } static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *head, *page; if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); head = try_grab_compound_head(pgd_page(orig), refs, flags); if (!head) return 0; if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { put_compound_head(head, refs, flags); return 0; } *nr += refs; SetPageReferenced(head); return 1; } static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they * can be serialised against THP migration. */ if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { /* * architecture have different format for hugetlbfs * pmd format and THP pmd format */ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, PMD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); return 1; } static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, PUD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = READ_ONCE(*p4dp); next = p4d_addr_end(addr, end); if (p4d_none(p4d)) return 0; BUILD_BUG_ON(p4d_huge(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, P4D_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); return 1; } static void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; pgdp = pgd_offset(current->mm, addr); do { pgd_t pgd = READ_ONCE(*pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; if (unlikely(pgd_huge(pgd))) { if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, flags, pages, nr)) return; } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } #else static inline void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { } #endif /* CONFIG_HAVE_FAST_GUP */ #ifndef gup_fast_permitted /* * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) { return true; } #endif static int __gup_longterm_unlocked(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { int ret; /* * FIXME: FOLL_LONGTERM does not work with * get_user_pages_unlocked() (see comments in that function) */ if (gup_flags & FOLL_LONGTERM) { mmap_read_lock(current->mm); ret = __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, gup_flags); mmap_read_unlock(current->mm); } else { ret = get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } return ret; } static unsigned long lockless_pages_from_mm(unsigned long start, unsigned long end, unsigned int gup_flags, struct page **pages) { unsigned long flags; int nr_pinned = 0; unsigned seq; if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || !gup_fast_permitted(start, end)) return 0; if (gup_flags & FOLL_PIN) { seq = raw_read_seqcount(&current->mm->write_protect_seq); if (seq & 1) return 0; } /* * Disable interrupts. The nested form is used, in order to allow full, * general purpose use of this routine. * * With interrupts disabled, we block page table pages from being freed * from under us. See struct mmu_table_batch comments in * include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock() here as we also want to block IPIs * that come from THPs splitting. */ local_irq_save(flags); gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); /* * When pinning pages for DMA there could be a concurrent write protect * from fork() via copy_page_range(), in this case always fail fast GUP. */ if (gup_flags & FOLL_PIN) { if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) { unpin_user_pages(pages, nr_pinned); return 0; } } return nr_pinned; } static int internal_get_user_pages_fast(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { unsigned long len, end; unsigned long nr_pinned; int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | FOLL_FAST_ONLY | FOLL_NOFAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(&current->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return 0; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) return nr_pinned; /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, pages); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so * returning -errno is not an option */ if (nr_pinned) return nr_pinned; return ret; } return ret + nr_pinned; } /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * Note a difference with get_user_pages_fast: this always returns the * number of pages pinned, 0 if no pages were pinned. * * If the architecture does not support this function, simply return with no * pages pinned. * * Careful, careful! COW breaking can go either way, so a non-write * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { int nr_pinned; /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. * * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ gup_flags |= FOLL_GET | FOLL_FAST_ONLY; nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); /* * As specified in the API description above, this routine is not * allowed to return negative values. However, the common core * routine internal_get_user_pages_fast() *can* return -errno. * Therefore, correct for that here: */ if (nr_pinned < 0) nr_pinned = 0; return nr_pinned; } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number requested. * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns * -errno. */ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { if (!is_valid_gup_flags(gup_flags)) return -EINVAL; /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK. However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ gup_flags |= FOLL_GET; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See * get_user_pages_fast() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; gup_flags |= FOLL_PIN; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); /* * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior * is the same, except that this one sets FOLL_PIN instead of FOLL_GET. * * The API rules are the same, too: no negative values may be returned. */ int pin_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { int nr_pinned; /* * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API * rules require returning 0, rather than -errno: */ if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return 0; /* * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY); nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); /* * This routine is not allowed to return negative values. However, * internal_get_user_pages_fast() *can* return -errno. Therefore, * correct for that here: */ if (nr_pinned < 0) nr_pinned = 0; return nr_pinned; } EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); /** * pin_user_pages_remote() - pin pages of a remote process * * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See * get_user_pages_remote() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; gup_flags |= FOLL_PIN; return __get_user_pages_remote(mm, start, nr_pages, gup_flags, pages, vmas, locked); } EXPORT_SYMBOL(pin_user_pages_remote); /** * pin_user_pages() - pin user pages in memory for use by other devices * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; gup_flags |= FOLL_PIN; return __gup_longterm_locked(current->mm, start, nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; gup_flags |= FOLL_PIN; return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); /* * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). * Behavior is the same, except that this one sets FOLL_PIN and rejects * FOLL_GET. */ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { /* * FIXME: Current FOLL_LONGTERM behavior is incompatible with * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on * vmas. As there are no users of this flag in this call we simply * disallow this option for now. */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; gup_flags |= FOLL_PIN; return __get_user_pages_locked(current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(pin_user_pages_locked);
1084 1083 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/rdma_netlink.h> #include <linux/kthread.h> #include "siw.h" #include "siw_verbs.h" MODULE_AUTHOR("Bernard Metzler"); MODULE_DESCRIPTION("Software iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* transmit from user buffer, if possible */ const bool zcopy_tx = true; /* Restrict usage of GSO, if hardware peer iwarp is unable to process * large packets. try_gso = true lets siw try to use local GSO, * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. */ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; struct crypto_shash *siw_crypto_shash; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_create_tx_threads(void) { int cpu, assigned = 0; for_each_online_cpu(cpu) { /* Skip HT cores */ if (cpu % cpumask_weight(topology_sibling_cpumask(cpu))) continue; siw_tx_thread[cpu] = kthread_create(siw_run_sq, (unsigned long *)(long)cpu, "siw_tx/%d", cpu); if (IS_ERR(siw_tx_thread[cpu])) { siw_tx_thread[cpu] = NULL; continue; } kthread_bind(siw_tx_thread[cpu], cpu); wake_up_process(siw_tx_thread[cpu]); assigned++; } return assigned; } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; while (--i >= 0) kfree(siw_cpu_info.tx_valid_cpus[i]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; return -ENOMEM; } static void siw_destroy_cpulist(void) { int i = 0; while (i < siw_cpu_info.num_nodes) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. */ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; sdev->netdev = netdev; if (netdev->type != ARPHRD_LOOPBACK) { addrconf_addr_eui48((unsigned char *)&base_dev->node_guid, netdev->dev_addr); } else { /* * The loopback device does not have a HW address, * but connection mangagement lib expects gid != 0 */ size_t len = min_t(size_t, strlen(base_dev->name), 6); char addr[6] = { }; memcpy(addr, base_dev->name, len); addrconf_addr_eui48((unsigned char *)&base_dev->node_guid, addr); } base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } /* * Network link becomes unavailable. Mark all * affected QP's accordingly. */ static void siw_netdev_down(struct work_struct *work) { struct siw_device *sdev = container_of(work, struct siw_device, netdev_down); struct siw_qp_attrs qp_attrs; struct list_head *pos, *tmp; memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.state = SIW_QP_STATE_ERROR; list_for_each_safe(pos, tmp, &sdev->qp_list) { struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); down_write(&qp->state_lock); WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); up_write(&qp->state_lock); } ib_device_put(&sdev->base_dev); } static void siw_device_goes_down(struct siw_device *sdev) { if (ib_device_try_get(&sdev->base_dev)) { INIT_WORK(&sdev->netdev_down, siw_netdev_down); schedule_work(&sdev->netdev_down); } } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); if (dev_net(netdev) != &init_net) return NOTIFY_OK; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_UP: sdev->state = IB_PORT_ACTIVE; siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; case NETDEV_GOING_DOWN: siw_device_goes_down(sdev); break; case NETDEV_DOWN: sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); break; case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. */ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * Todo: Below netdev events are currently not handled. */ case NETDEV_CHANGEMTU: case NETDEV_CHANGE: break; default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); if (netif_running(netdev) && netif_carrier_ok(netdev)) sdev->state = IB_PORT_ACTIVE; else sdev->state = IB_PORT_DOWN; rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; int nr_cpu; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } /* * Locate CRC32 algorithm. If unsuccessful, fail * loading siw only, if CRC is required. */ siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(siw_crypto_shash)) { pr_info("siw: Loading CRC32c failed: %ld\n", PTR_ERR(siw_crypto_shash)); siw_crypto_shash = NULL; if (mpa_crc_required) { rv = -EOPNOTSUPP; goto out_error; } } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) { if (siw_tx_thread[nr_cpu]) { siw_stop_tx_thread(nr_cpu); siw_tx_thread[nr_cpu] = NULL; } } if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(); return rv; } static void __exit siw_exit_module(void) { int cpu; for_each_possible_cpu(cpu) { if (siw_tx_thread[cpu]) { siw_stop_tx_thread(cpu); siw_tx_thread[cpu] = NULL; } } unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw");
2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/in.h> #include <linux/export.h> #include <linux/time.h> #include <linux/rds.h> #include "rds.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, struct in6_addr *saddr) { refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; inc->i_saddr = *saddr; inc->i_usercopy.rdma_cookie = 0; inc->i_usercopy.rx_tstamp = ktime_set(0, 0); memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace)); } EXPORT_SYMBOL_GPL(rds_inc_init); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, struct in6_addr *saddr) { refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = cp->cp_conn; inc->i_conn_path = cp; inc->i_saddr = *saddr; inc->i_usercopy.rdma_cookie = 0; inc->i_usercopy.rx_tstamp = ktime_set(0, 0); } EXPORT_SYMBOL_GPL(rds_inc_path_init); static void rds_inc_addref(struct rds_incoming *inc) { rdsdebug("addref inc %p ref %d\n", inc, refcount_read(&inc->i_refcount)); refcount_inc(&inc->i_refcount); } void rds_inc_put(struct rds_incoming *inc) { rdsdebug("put inc %p ref %d\n", inc, refcount_read(&inc->i_refcount)); if (refcount_dec_and_test(&inc->i_refcount)) { BUG_ON(!list_empty(&inc->i_item)); inc->i_conn->c_trans->inc_free(inc); } } EXPORT_SYMBOL_GPL(rds_inc_put); static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, struct rds_cong_map *map, int delta, __be16 port) { int now_congested; if (delta == 0) return; rs->rs_rcv_bytes += delta; if (delta > 0) rds_stats_add(s_recv_bytes_added_to_socket, delta); else rds_stats_add(s_recv_bytes_removed_from_socket, -delta); /* loop transport doesn't send/recv congestion updates */ if (rs->rs_transport->t_type == RDS_TRANS_LOOP) return; now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d " "now_cong %d delta %d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, rds_sk_rcvbuf(rs), now_congested, delta); /* wasn't -> am congested */ if (!rs->rs_congested && now_congested) { rs->rs_congested = 1; rds_cong_set_bit(map, port); rds_cong_queue_updates(map); } /* was -> aren't congested */ /* Require more free space before reporting uncongested to prevent bouncing cong/uncong state too often */ else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) { rs->rs_congested = 0; rds_cong_clear_bit(map, port); rds_cong_queue_updates(map); } /* do nothing if no change in cong state */ } static void rds_conn_peer_gen_update(struct rds_connection *conn, u32 peer_gen_num) { int i; struct rds_message *rm, *tmp; unsigned long flags; WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP); if (peer_gen_num != 0) { if (conn->c_peer_gen_num != 0 && peer_gen_num != conn->c_peer_gen_num) { for (i = 0; i < RDS_MPATH_WORKERS; i++) { struct rds_conn_path *cp; cp = &conn->c_path[i]; spin_lock_irqsave(&cp->cp_lock, flags); cp->cp_next_tx_seq = 1; cp->cp_next_rx_seq = 0; list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { set_bit(RDS_MSG_FLUSH, &rm->m_flags); } spin_unlock_irqrestore(&cp->cp_lock, flags); } } conn->c_peer_gen_num = peer_gen_num; } } /* * Process all extension headers that come with this message. */ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs) { struct rds_header *hdr = &inc->i_hdr; unsigned int pos = 0, type, len; union { struct rds_ext_header_version version; struct rds_ext_header_rdma rdma; struct rds_ext_header_rdma_dest rdma_dest; } buffer; while (1) { len = sizeof(buffer); type = rds_message_next_extension(hdr, &pos, &buffer, &len); if (type == RDS_EXTHDR_NONE) break; /* Process extension header here */ switch (type) { case RDS_EXTHDR_RDMA: rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0); break; case RDS_EXTHDR_RDMA_DEST: /* We ignore the size for now. We could stash it * somewhere and use it for error checking. */ inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie( be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); break; } } } static void rds_recv_hs_exthdrs(struct rds_header *hdr, struct rds_connection *conn) { unsigned int pos = 0, type, len; union { struct rds_ext_header_version version; u16 rds_npaths; u32 rds_gen_num; } buffer; u32 new_peer_gen_num = 0; while (1) { len = sizeof(buffer); type = rds_message_next_extension(hdr, &pos, &buffer, &len); if (type == RDS_EXTHDR_NONE) break; /* Process extension header here */ switch (type) { case RDS_EXTHDR_NPATHS: conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, be16_to_cpu(buffer.rds_npaths)); break; case RDS_EXTHDR_GEN_NUM: new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num); break; default: pr_warn_ratelimited("ignoring unknown exthdr type " "0x%x\n", type); } } /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ conn->c_npaths = max_t(int, conn->c_npaths, 1); conn->c_ping_triggered = 0; rds_conn_peer_gen_update(conn, new_peer_gen_num); } /* rds_start_mprds() will synchronously start multiple paths when appropriate. * The scheme is based on the following rules: * * 1. rds_sendmsg on first connect attempt sends the probe ping, with the * sender's npaths (s_npaths) * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It * sends back a probe-pong with r_npaths. After that, if rcvr is the * smaller ip addr, it starts rds_conn_path_connect_if_down on all * mprds_paths. * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down. * If it is the smaller ipaddr, rds_conn_path_connect_if_down can be * called after reception of the probe-pong on all mprds_paths. * Otherwise (sender of probe-ping is not the smaller ip addr): just call * rds_conn_path_connect_if_down on the hashed path. (see rule 4) * 4. rds_connect_worker must only trigger a connection if laddr < faddr. * 5. sender may end up queuing the packet on the cp. will get sent out later. * when connection is completed. */ static void rds_start_mprds(struct rds_connection *conn) { int i; struct rds_conn_path *cp; if (conn->c_npaths > 1 && rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) { for (i = 0; i < conn->c_npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_connect_if_down(cp); } } } /* * The transport must make sure that this is serialized against other * rx and conn reset on this specific conn. * * We currently assert that only one fragmented message will be sent * down a connection at a time. This lets us reassemble in the conn * instead of per-flow which means that we don't have to go digging through * flows to tear down partial reassembly progress on conn failure and * we save flow lookup and locking for each frag arrival. It does mean * that small messages will wait behind large ones. Fragmenting at all * is only to reduce the memory consumption of pre-posted buffers. * * The caller passes in saddr and daddr instead of us getting it from the * conn. This lets loopback, who only has one conn for both directions, * tell us which roles the addrs in the conn are playing for this message. */ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp) { struct rds_sock *rs = NULL; struct sock *sk; unsigned long flags; struct rds_conn_path *cp; inc->i_conn = conn; inc->i_rx_jiffies = jiffies; if (conn->c_trans->t_mp_capable) cp = inc->i_conn_path; else cp = &conn->c_path[0]; rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " "flags 0x%x rx_jiffies %lu\n", conn, (unsigned long long)cp->cp_next_rx_seq, inc, (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), be32_to_cpu(inc->i_hdr.h_len), be16_to_cpu(inc->i_hdr.h_sport), be16_to_cpu(inc->i_hdr.h_dport), inc->i_hdr.h_flags, inc->i_rx_jiffies); /* * Sequence numbers should only increase. Messages get their * sequence number as they're queued in a sending conn. They * can be dropped, though, if the sending socket is closed before * they hit the wire. So sequence numbers can skip forward * under normal operation. They can also drop back in the conn * failover case as previously sent messages are resent down the * new instance of a conn. We drop those, otherwise we have * to assume that the next valid seq does not come after a * hole in the fragment stream. * * The headers don't give us a way to realize if fragments of * a message have been dropped. We assume that frags that arrive * to a flow are part of the current message on the flow that is * being reassembled. This means that senders can't drop messages * from the sending conn until all their frags are sent. * * XXX we could spend more on the wire to get more robust failure * detection, arguably worth it to avoid data corruption. */ if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { rds_stats_inc(s_recv_drop_old_seq); goto out; } cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { if (inc->i_hdr.h_sport == 0) { rdsdebug("ignore ping with 0 sport from %pI6c\n", saddr); goto out; } rds_stats_inc(s_recv_ping); rds_send_pong(cp, inc->i_hdr.h_sport); /* if this is a handshake ping, start multipath if necessary */ if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport), be16_to_cpu(inc->i_hdr.h_dport))) { rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); rds_start_mprds(cp->cp_conn); } goto out; } if (be16_to_cpu(inc->i_hdr.h_dport) == RDS_FLAG_PROBE_PORT && inc->i_hdr.h_sport == 0) { rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); /* if this is a handshake pong, start multipath if necessary */ rds_start_mprds(cp->cp_conn); wake_up(&cp->cp_conn->c_hs_waitq); goto out; } rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if); if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; } /* Process extension headers */ rds_recv_incoming_exthdrs(inc, rs); /* We can be racing with rds_release() which marks the socket dead. */ sk = rds_rs_to_sk(rs); /* serialize with rds_release -> sock_orphan */ write_lock_irqsave(&rs->rs_recv_lock, flags); if (!sock_flag(sk, SOCK_DEAD)) { rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs); rds_stats_inc(s_recv_queued); rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); if (sock_flag(sk, SOCK_RCVTSTAMP)) inc->i_usercopy.rx_tstamp = ktime_get_real(); rds_inc_addref(inc); inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); list_add_tail(&inc->i_item, &rs->rs_recv_queue); __rds_wake_sk_sleep(sk); } else { rds_stats_inc(s_recv_drop_dead_sock); } write_unlock_irqrestore(&rs->rs_recv_lock, flags); out: if (rs) rds_sock_put(rs); } EXPORT_SYMBOL_GPL(rds_recv_incoming); /* * be very careful here. This is being called as the condition in * wait_event_*() needs to cope with being called many times. */ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) { unsigned long flags; if (!*inc) { read_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&rs->rs_recv_queue)) { *inc = list_entry(rs->rs_recv_queue.next, struct rds_incoming, i_item); rds_inc_addref(*inc); } read_unlock_irqrestore(&rs->rs_recv_lock, flags); } return *inc != NULL; } static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, int drop) { struct sock *sk = rds_rs_to_sk(rs); int ret = 0; unsigned long flags; write_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&inc->i_item)) { ret = 1; if (drop) { /* XXX make sure this i_conn is reliable */ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); list_del_init(&inc->i_item); rds_inc_put(inc); } } write_unlock_irqrestore(&rs->rs_recv_lock, flags); rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); return ret; } /* * Pull errors off the error queue. * If msghdr is NULL, we will just purge the error queue. */ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) { struct rds_notifier *notifier; struct rds_rdma_notify cmsg; unsigned int count = 0, max_messages = ~0U; unsigned long flags; LIST_HEAD(copy); int err = 0; memset(&cmsg, 0, sizeof(cmsg)); /* fill holes with zero */ /* put_cmsg copies to user space and thus may sleep. We can't do this * with rs_lock held, so first grab as many notifications as we can stuff * in the user provided cmsg buffer. We don't try to copy more, to avoid * losing notifications - except when the buffer is so small that it wouldn't * even hold a single notification. Then we give him as much of this single * msg as we can squeeze in, and set MSG_CTRUNC. */ if (msghdr) { max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg)); if (!max_messages) max_messages = 1; } spin_lock_irqsave(&rs->rs_lock, flags); while (!list_empty(&rs->rs_notify_queue) && count < max_messages) { notifier = list_entry(rs->rs_notify_queue.next, struct rds_notifier, n_list); list_move(&notifier->n_list, &copy); count++; } spin_unlock_irqrestore(&rs->rs_lock, flags); if (!count) return 0; while (!list_empty(&copy)) { notifier = list_entry(copy.next, struct rds_notifier, n_list); if (msghdr) { cmsg.user_token = notifier->n_user_token; cmsg.status = notifier->n_status; err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, sizeof(cmsg), &cmsg); if (err) break; } list_del_init(&notifier->n_list); kfree(notifier); } /* If we bailed out because of an error in put_cmsg, * we may be left with one or more notifications that we * didn't process. Return them to the head of the list. */ if (!list_empty(&copy)) { spin_lock_irqsave(&rs->rs_lock, flags); list_splice(&copy, &rs->rs_notify_queue); spin_unlock_irqrestore(&rs->rs_lock, flags); } return err; } /* * Queue a congestion notification */ static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) { uint64_t notify = rs->rs_cong_notify; unsigned long flags; int err; err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, sizeof(notify), &notify); if (err) return err; spin_lock_irqsave(&rs->rs_lock, flags); rs->rs_cong_notify &= ~notify; spin_unlock_irqrestore(&rs->rs_lock, flags); return 0; } /* * Receive any control messages. */ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, struct rds_sock *rs) { int ret = 0; if (inc->i_usercopy.rdma_cookie) { ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, sizeof(inc->i_usercopy.rdma_cookie), &inc->i_usercopy.rdma_cookie); if (ret) goto out; } if ((inc->i_usercopy.rx_tstamp != 0) && sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { struct __kernel_old_timeval tv = ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp); if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) { ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } else { struct __kernel_sock_timeval sk_tv; sk_tv.tv_sec = tv.tv_sec; sk_tv.tv_usec = tv.tv_usec; ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(sk_tv), &sk_tv); } if (ret) goto out; } if (rs->rs_rx_traces) { struct rds_cmsg_rx_trace t; int i, j; memset(&t, 0, sizeof(t)); inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); t.rx_traces = rs->rs_rx_traces; for (i = 0; i < rs->rs_rx_traces; i++) { j = rs->rs_rx_trace[i]; t.rx_trace_pos[i] = j; t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] - inc->i_rx_lat_trace[j]; } ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY, sizeof(t), &t); if (ret) goto out; } out: return ret; } static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) { struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue; struct rds_msg_zcopy_info *info = NULL; struct rds_zcopy_cookies *done; unsigned long flags; if (!msg->msg_control) return false; if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || msg->msg_controllen < CMSG_SPACE(sizeof(*done))) return false; spin_lock_irqsave(&q->lock, flags); if (!list_empty(&q->zcookie_head)) { info = list_entry(q->zcookie_head.next, struct rds_msg_zcopy_info, rs_zcookie_next); list_del(&info->rs_zcookie_next); } spin_unlock_irqrestore(&q->lock, flags); if (!info) return false; done = &info->zcookies; if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), done)) { spin_lock_irqsave(&q->lock, flags); list_add(&info->rs_zcookie_next, &q->zcookie_head); spin_unlock_irqrestore(&q->lock, flags); return false; } kfree(info); return true; } int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); long timeo; int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct rds_incoming *inc = NULL; /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ timeo = sock_rcvtimeo(sk, nonblock); rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); if (msg_flags & MSG_OOB) goto out; if (msg_flags & MSG_ERRQUEUE) return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); while (1) { /* If there are pending notifications, do those - and nothing else */ if (!list_empty(&rs->rs_notify_queue)) { ret = rds_notify_queue_get(rs, msg); break; } if (rs->rs_cong_notify) { ret = rds_notify_cong(rs, msg); break; } if (!rds_next_incoming(rs, &inc)) { if (nonblock) { bool reaped = rds_recvmsg_zcookie(rs, msg); ret = reaped ? 0 : -EAGAIN; break; } timeo = wait_event_interruptible_timeout(*sk_sleep(sk), (!list_empty(&rs->rs_notify_queue) || rs->rs_cong_notify || rds_next_incoming(rs, &inc)), timeo); rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, timeo); if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) continue; ret = timeo; if (ret == 0) ret = -ETIMEDOUT; break; } rdsdebug("copying inc %p from %pI6c:%u to user\n", inc, &inc->i_conn->c_faddr, ntohs(inc->i_hdr.h_sport)); ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); if (ret < 0) break; /* * if the message we just copied isn't at the head of the * recv queue then someone else raced us to return it, try * to get the next message. */ if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { rds_inc_put(inc); inc = NULL; rds_stats_inc(s_recv_deliver_raced); iov_iter_revert(&msg->msg_iter, ret); continue; } if (ret < be32_to_cpu(inc->i_hdr.h_len)) { if (msg_flags & MSG_TRUNC) ret = be32_to_cpu(inc->i_hdr.h_len); msg->msg_flags |= MSG_TRUNC; } if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; break; } rds_recvmsg_zcookie(rs, msg); rds_stats_inc(s_recv_delivered); if (msg->msg_name) { if (ipv6_addr_v4mapped(&inc->i_saddr)) { sin->sin_family = AF_INET; sin->sin_port = inc->i_hdr.h_sport; sin->sin_addr.s_addr = inc->i_saddr.s6_addr32[3]; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); msg->msg_namelen = sizeof(*sin); } else { sin6->sin6_family = AF_INET6; sin6->sin6_port = inc->i_hdr.h_sport; sin6->sin6_addr = inc->i_saddr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = rs->rs_bound_scope_id; msg->msg_namelen = sizeof(*sin6); } } break; } if (inc) rds_inc_put(inc); out: return ret; } /* * The socket is being shut down and we're asked to drop messages that were * queued for recvmsg. The caller has unbound the socket so the receive path * won't queue any more incoming fragments or messages on the socket. */ void rds_clear_recv_queue(struct rds_sock *rs) { struct sock *sk = rds_rs_to_sk(rs); struct rds_incoming *inc, *tmp; unsigned long flags; write_lock_irqsave(&rs->rs_recv_lock, flags); list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); list_del_init(&inc->i_item); rds_inc_put(inc); } write_unlock_irqrestore(&rs->rs_recv_lock, flags); } /* * inc->i_saddr isn't used here because it is only set in the receive * path. */ void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip) { struct rds_info_message minfo; minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo.len = be32_to_cpu(inc->i_hdr.h_len); minfo.tos = inc->i_conn->c_tos; if (flip) { minfo.laddr = daddr; minfo.faddr = saddr; minfo.lport = inc->i_hdr.h_dport; minfo.fport = inc->i_hdr.h_sport; } else { minfo.laddr = saddr; minfo.faddr = daddr; minfo.lport = inc->i_hdr.h_sport; minfo.fport = inc->i_hdr.h_dport; } minfo.flags = 0; rds_info_copy(iter, &minfo, sizeof(minfo)); } #if IS_ENABLED(CONFIG_IPV6) void rds6_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, struct in6_addr *saddr, struct in6_addr *daddr, int flip) { struct rds6_info_message minfo6; minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo6.len = be32_to_cpu(inc->i_hdr.h_len); minfo6.tos = inc->i_conn->c_tos; if (flip) { minfo6.laddr = *daddr; minfo6.faddr = *saddr; minfo6.lport = inc->i_hdr.h_dport; minfo6.fport = inc->i_hdr.h_sport; } else { minfo6.laddr = *saddr; minfo6.faddr = *daddr; minfo6.lport = inc->i_hdr.h_sport; minfo6.fport = inc->i_hdr.h_dport; } minfo6.flags = 0; rds_info_copy(iter, &minfo6, sizeof(minfo6)); } #endif
333 132 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _DELAYED_CALL_H #define _DELAYED_CALL_H /* * Poor man's closures; I wish we could've done them sanely polymorphic, * but... */ struct delayed_call { void (*fn)(void *); void *arg; }; #define DEFINE_DELAYED_CALL(name) struct delayed_call name = {NULL, NULL} /* I really wish we had closures with sane typechecking... */ static inline void set_delayed_call(struct delayed_call *call, void (*fn)(void *), void *arg) { call->fn = fn; call->arg = arg; } static inline void do_delayed_call(struct delayed_call *call) { if (call->fn) call->fn(call->arg); } static inline void clear_delayed_call(struct delayed_call *call) { call->fn = NULL; } #endif
1085 1084 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if_arp.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/kernel.h> #include <linux/llc.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/openvswitch.h> #include <linux/export.h> #include <net/ip_tunnels.h> #include <net/rtnetlink.h> #include "datapath.h" #include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static struct vport_ops ovs_netdev_vport_ops; /* Must be called with rcu_read_lock. */ static void netdev_port_receive(struct sk_buff *skb) { struct vport *vport; vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; if (unlikely(skb_warn_if_lro(skb))) goto error; /* Make our own copy of the packet. Otherwise we will mangle the * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; if (skb->dev->type == ARPHRD_ETHER) skb_push_rcsum(skb, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: kfree_skb(skb); } /* Called with rcu_read_lock and bottom-halves disabled. */ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } static struct net_device *get_dpdev(const struct datapath *dp) { struct vport *local; local = ovs_vport_ovsl(dp, OVSP_LOCAL); return local->dev; } struct vport *ovs_netdev_link(struct vport *vport, const char *name) { int err; vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); if (!vport->dev) { err = -ENODEV; goto error_free_vport; } if (vport->dev->flags & IFF_LOOPBACK || (vport->dev->type != ARPHRD_ETHER && vport->dev->type != ARPHRD_NONE) || ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); if (err) goto error_unlock; err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; dev_disable_lro(vport->dev); dev_set_promiscuity(vport->dev, 1); vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: dev_put(vport->dev); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; return ovs_netdev_link(vport, parms->name); } static void vport_netdev_free(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); if (vport->dev) dev_put(vport->dev); ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { ASSERT_RTNL(); vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(vport->dev); netdev_upper_dev_unlink(vport->dev, netdev_master_upper_dev_get(vport->dev)); dev_set_promiscuity(vport->dev, -1); } static void netdev_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } void ovs_netdev_tunnel_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); /* We can be invoked by both explicit vport deletion and * underlying netdev deregistration; delete the link only * if it's not already shutting down. */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev); dev_put(vport->dev); vport->dev = NULL; rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) { if (likely(netif_is_ovs_port(dev))) return (struct vport *) rcu_dereference_rtnl(dev->rx_handler_data); else return NULL; } static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, .send = dev_queue_xmit, }; int __init ovs_netdev_init(void) { return ovs_vport_ops_register(&ovs_netdev_vport_ops); } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); }
728 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_KSM_H #define __LINUX_KSM_H /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork(). */ #include <linux/bitops.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/sched.h> #include <linux/sched/coredump.h> struct stable_node; struct mem_cgroup; #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) return __ksm_enter(mm); return 0; } static inline void ksm_exit(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) __ksm_exit(mm); } /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, * it might be faulted into a different anon_vma (or perhaps to a different * offset in the same anon_vma). do_swap_page() cannot do all the locking * needed to reconstitute a cross-anon_vma KSM page: for now it has to make * a copy, and leave remerging the pages to a later pass of ksmd. * * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? */ struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); #else /* !CONFIG_KSM */ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { return 0; } static inline void ksm_exit(struct mm_struct *mm) { } #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { return 0; } static inline struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { return page; } static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { } static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ #endif /* __LINUX_KSM_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 /* SPDX-License-Identifier: GPL-2.0-only */ /* * kref.h - library routines for handling generic reference counted objects * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Corp. * * based on kobject.h which was: * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (C) 2002-2003 Open Source Development Labs */ #ifndef _KREF_H_ #define _KREF_H_ #include <linux/spinlock.h> #include <linux/refcount.h> struct kref { refcount_t refcount; }; #define KREF_INIT(n) { .refcount = REFCOUNT_INIT(n), } /** * kref_init - initialize object. * @kref: object in question. */ static inline void kref_init(struct kref *kref) { refcount_set(&kref->refcount, 1); } static inline unsigned int kref_read(const struct kref *kref) { return refcount_read(&kref->refcount); } /** * kref_get - increment refcount for object. * @kref: object. */ static inline void kref_get(struct kref *kref) { refcount_inc(&kref->refcount); } /** * kref_put - decrement refcount for object. * @kref: object. * @release: pointer to the function that will clean up the object when the * last reference to the object is released. * This pointer is required, and it is not acceptable to pass kfree * in as this function. * * Decrement the refcount, and if 0, call release(). * Return 1 if the object was removed, otherwise return 0. Beware, if this * function returns 0, you still can not count on the kref from remaining in * memory. Only use the return value if you want to see if the kref is now * gone, not present. */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { if (refcount_dec_and_test(&kref->refcount)) { release(kref); return 1; } return 0; } static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *lock) { if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) { if (refcount_dec_and_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } /** * kref_get_unless_zero - Increment refcount for object unless it is zero. * @kref: object. * * Return non-zero if the increment succeeded. Otherwise return 0. * * This function is intended to simplify locking around refcounting for * objects that can be looked up from a lookup structure, and which are * removed from that lookup structure in the object destructor. * Operations on such objects require at least a read lock around * lookup + kref_get, and a write lock around kref_put + remove from lookup * structure. Furthermore, RCU implementations become extremely tricky. * With a lookup followed by a kref_get_unless_zero *with return value check* * locking in the kref_put path can be deferred to the actual removal from * the lookup structure and RCU lookups become trivial. */ static inline int __must_check kref_get_unless_zero(struct kref *kref) { return refcount_inc_not_zero(&kref->refcount); } #endif /* _KREF_H_ */
3120 517 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_INTERNAL_H #define _LINUX_HIGHMEM_INTERNAL_H /* * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); void kunmap_local_indexed(void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); void __kmap_local_sched_in(void); static inline void kmap_assert_nomap(void) { DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); } #else static inline void kmap_local_fork(struct task_struct *tsk) { } static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM #include <asm/highmem.h> #ifndef ARCH_HAS_KMAP_FLUSH_TLB static inline void kmap_flush_tlb(unsigned long addr) { } #endif #ifndef kmap_prot #define kmap_prot PAGE_KERNEL #endif void *kmap_high(struct page *page); void kunmap_high(struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); static inline void *kmap(struct page *page) { void *addr; might_sleep(); if (!PageHighMem(page)) addr = page_address(page); else addr = kmap_high(page); kmap_flush_tlb((unsigned long)addr); return addr; } static inline void kunmap(struct page *page) { might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); } static inline struct page *kmap_to_page(void *addr) { return __kmap_to_page(addr); } static inline void kmap_flush_unused(void) { __kmap_flush_unused(); } static inline void *kmap_local_page(struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } static inline void *kmap_local_pfn(unsigned long pfn) { return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_local(void *vaddr) { kunmap_local_indexed(vaddr); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_page_prot(page, prot); } static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } static inline void *kmap_atomic_pfn(unsigned long pfn) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_atomic(void *addr) { kunmap_local_indexed(addr); pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } unsigned int __nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; static inline unsigned int nr_free_highpages(void) { return __nr_free_highpages(); } static inline unsigned long totalhigh_pages(void) { return (unsigned long)atomic_long_read(&_totalhigh_pages); } static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); } #else /* CONFIG_HIGHMEM */ static inline struct page *kmap_to_page(void *addr) { return virt_to_page(addr); } static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } static inline void kunmap_high(struct page *page) { } static inline void kmap_flush_unused(void) { } static inline void kunmap(struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } static inline void *kmap_local_page(struct page *page) { return page_address(page); } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return kmap_local_page(page); } static inline void *kmap_local_pfn(unsigned long pfn) { return kmap_local_page(pfn_to_page(pfn)); } static inline void __kunmap_local(void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } static inline void *kmap_atomic(struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return page_address(page); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { return kmap_atomic(page); } static inline void *kmap_atomic_pfn(unsigned long pfn) { return kmap_atomic(pfn_to_page(pfn)); } static inline void __kunmap_atomic(void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } static inline unsigned int nr_free_highpages(void) { return 0; } static inline unsigned long totalhigh_pages(void) { return 0UL; } #endif /* CONFIG_HIGHMEM */ /* * Prevent people trying to call kunmap_atomic() as if it were kunmap() * kunmap_atomic() should get the return value of kmap_atomic, not the page. */ #define kunmap_atomic(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_atomic(__addr); \ } while (0) #define kunmap_local(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_local(__addr); \ } while (0) #endif
26 564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 // SPDX-License-Identifier: GPL-2.0-only /* Expectation handling for nf_conntrack. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (c) 2005-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> #include <linux/siphash.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); struct hlist_head *nf_ct_expect_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static siphash_key_t nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); struct nf_conntrack_net *cnet; WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); cnet = nf_ct_pernet(net); cnet->expect_count--; hlist_del_rcu(&exp->lnode); master_help->expecting[exp->class]--; nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); } EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_put(exp); } static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { struct { union nf_inet_addr dst_addr; u32 net_mix; u16 dport; u8 l3num; u8 protonum; } __aligned(SIPHASH_ALIGNMENT) combined; u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); memset(&combined, 0, sizeof(combined)); combined.dst_addr = tuple->dst.u3; combined.net_mix = net_hash_mix(n); combined.dport = (__force __u16)tuple->dst.u.all; combined.l3num = tuple->src.l3num; combined.protonum = tuple->dst.protonum; hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } static bool nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_expect *i, const struct nf_conntrack_zone *zone, const struct net *net) { return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && net_eq(net, nf_ct_net(i->master)) && nf_ct_zone_equal_any(i->master, zone); } bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; } return false; } EXPORT_SYMBOL_GPL(nf_ct_remove_expect); struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; } EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; rcu_read_lock(); i = __nf_ct_expect_find(net, zone, tuple); if (i && !refcount_inc_not_zero(&i->use)) i = NULL; rcu_read_unlock(); return i; } EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } } if (!exp) return NULL; /* If master is not in hash table yet (ie. packet hasn't left this machine yet), how can other end know about expected? Hence these are not the droids you are looking for (if master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (!nf_ct_is_confirmed(exp->master)) return NULL; /* Avoid race with other CPUs, that for exp->master ct, is * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. */ if (unlikely(nf_ct_is_dying(exp->master) || !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } /* Undo exp->master refcnt increase, if del_timer() failed */ nf_ct_put(exp->master); return NULL; } /* delete all expectations for this conntrack */ void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; struct hlist_node *next; /* Optimization: most connection never expect any others. */ if (!help) return; spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { nf_ct_remove_expect(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); /* Would two expected things clash? */ static inline int expect_clash(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { /* Part covered by intersection of masks must be unequal, otherwise they clash */ struct nf_conntrack_tuple_mask intersect_mask; int count; intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ intersect_mask.src.u3.all[count] = a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { return nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static bool master_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b, unsigned int flags) { if (flags & NF_CT_EXP_F_SKIP_MASTER) return true; return a->master == b->master; } /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_remove_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); /* We don't increase the master conntrack refcount for non-fulfilled * conntracks. During the conntrack destruction, the expectations are * always killed before the conntrack itself */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) { struct nf_conntrack_expect *new; new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC); if (!new) return NULL; new->master = me; refcount_set(&new->use, 1); return new; } EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, u_int8_t family, const union nf_inet_addr *saddr, const union nf_inet_addr *daddr, u_int8_t proto, const __be16 *src, const __be16 *dst) { int len; if (family == AF_INET) len = 4; else len = 16; exp->flags = 0; exp->class = class; exp->expectfn = NULL; exp->helper = NULL; exp->tuple.src.l3num = family; exp->tuple.dst.protonum = proto; if (saddr) { memcpy(&exp->tuple.src.u3, saddr, len); if (sizeof(exp->tuple.src.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.src.u3 + len, 0x00, sizeof(exp->tuple.src.u3) - len); memset(&exp->mask.src.u3, 0xFF, len); if (sizeof(exp->mask.src.u3) > len) memset((void *)&exp->mask.src.u3 + len, 0x00, sizeof(exp->mask.src.u3) - len); } else { memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3)); memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3)); } if (src) { exp->tuple.src.u.all = *src; exp->mask.src.u.all = htons(0xFFFF); } else { exp->tuple.src.u.all = 0; exp->mask.src.u.all = 0; } memcpy(&exp->tuple.dst.u3, daddr, len); if (sizeof(exp->tuple.dst.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.dst.u3 + len, 0x00, sizeof(exp->tuple.dst.u3) - len); exp->tuple.dst.u.all = *dst; #if IS_ENABLED(CONFIG_NF_NAT) memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); #endif } EXPORT_SYMBOL_GPL(nf_ct_expect_init); static void nf_ct_expect_free_rcu(struct rcu_head *head) { struct nf_conntrack_expect *exp; exp = container_of(head, struct nf_conntrack_expect, rcu); kmem_cache_free(nf_ct_expect_cachep, exp); } void nf_ct_expect_put(struct nf_conntrack_expect *exp) { if (refcount_dec_and_test(&exp->use)) call_rcu(&exp->rcu, nf_ct_expect_free_rcu); } EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conntrack_net *cnet; struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ refcount_add(2, &exp->use); timer_setup(&exp->timeout, nf_ct_expectation_timed_out, 0); helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { exp->timeout.expires = jiffies + helper->expect_policy[exp->class].timeout * HZ; } add_timer(&exp->timeout); hlist_add_head_rcu(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); cnet = nf_ct_pernet(net); cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); } /* Race with expectations being used means we could have none to find; OK. */ static void evict_oldest_expect(struct nf_conn *master, struct nf_conntrack_expect *new) { struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_expect *exp, *last = NULL; hlist_for_each_entry(exp, &master_help->expectations, lnode) { if (exp->class == new->class) last = exp; } if (last) nf_ct_remove_expect(last); } static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, unsigned int flags) { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; struct nf_conntrack_net *cnet; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); struct hlist_node *next; unsigned int h; int ret = 0; if (!master_help) { ret = -ESHUTDOWN; goto out; } h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (master_matches(i, expect, flags) && expect_matches(i, expect)) { if (i->class != expect->class || i->master != expect->master) return -EALREADY; if (nf_ct_remove_expect(i)) break; } else if (expect_clash(i, expect)) { ret = -EBUSY; goto out; } } /* Will be over limit? */ helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { p = &helper->expect_policy[expect->class]; if (p->max_expected && master_help->expecting[expect->class] >= p->max_expected) { evict_oldest_expect(master, expect); if (master_help->expecting[expect->class] >= p->max_expected) { ret = -EMFILE; goto out; } } } cnet = nf_ct_pernet(net); if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } out: return ret; } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report, unsigned int flags) { int ret; spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect, flags); if (ret < 0) goto out; nf_ct_expect_insert(expect); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return 0; out: spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_destroy); void nf_ct_expect_iterate_net(struct net *net, bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data, u32 portid, int report) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (!net_eq(nf_ct_exp_net(exp), net)) continue; if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_net); #ifdef CONFIG_NF_CONNTRACK_PROCFS struct ct_expect_iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } return NULL; } static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head = ct_expect_get_first(seq); if (head) while (pos && (head = ct_expect_get_next(seq, head))) pos--; return pos ? NULL : head; } static void *exp_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return ct_expect_get_idx(seq, *pos); } static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return ct_expect_get_next(seq, v); } static void exp_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int exp_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_expect *expect; struct nf_conntrack_helper *helper; struct hlist_node *n = v; char *delim = ""; expect = hlist_entry(n, struct nf_conntrack_expect, hnode); if (expect->timeout.function) seq_printf(s, "%ld ", timer_pending(&expect->timeout) ? (long)(expect->timeout.expires - jiffies)/HZ : 0); else seq_puts(s, "- "); seq_printf(s, "l3proto = %u proto=%u ", expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, nf_ct_l4proto_find(expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { seq_puts(s, "PERMANENT"); delim = ","; } if (expect->flags & NF_CT_EXPECT_INACTIVE) { seq_printf(s, "%sINACTIVE", delim); delim = ","; } if (expect->flags & NF_CT_EXPECT_USERSPACE) seq_printf(s, "%sUSERSPACE", delim); helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); if (helper->expect_policy[expect->class].name[0]) seq_printf(s, "/%s", helper->expect_policy[expect->class].name); } seq_putc(s, '\n'); return 0; } static const struct seq_operations exp_seq_ops = { .start = exp_seq_start, .next = exp_seq_next, .stop = exp_seq_stop, .show = exp_seq_show }; #endif /* CONFIG_NF_CONNTRACK_PROCFS */ static int exp_proc_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; proc = proc_create_net("nf_conntrack_expect", 0440, net->proc_net, &exp_seq_ops, sizeof(struct ct_expect_iter_state)); if (!proc) return -ENOMEM; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ return 0; } static void exp_proc_remove(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS remove_proc_entry("nf_conntrack_expect", net->proc_net); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ } module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); } int nf_conntrack_expect_init(void) { if (!nf_ct_expect_hsize) { nf_ct_expect_hsize = nf_conntrack_htable_size / 256; if (!nf_ct_expect_hsize) nf_ct_expect_hsize = 1; } nf_ct_expect_max = nf_ct_expect_hsize * 4; nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", sizeof(struct nf_conntrack_expect), 0, 0, NULL); if (!nf_ct_expect_cachep) return -ENOMEM; nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (!nf_ct_expect_hash) { kmem_cache_destroy(nf_ct_expect_cachep); return -ENOMEM; } return 0; } void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); kvfree(nf_ct_expect_hash); }
26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 /* * An async IO implementation for Linux * Written by Benjamin LaHaise <bcrl@kvack.org> * * Implements an efficient asynchronous io interface. * * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. * Copyright 2018 Christoph Hellwig. * * See ../COPYING for licensing terms. */ #define pr_fmt(fmt) "%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/aio_abi.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/backing-dev.h> #include <linux/refcount.h> #include <linux/uio.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> #include <linux/highmem.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/eventfd.h> #include <linux/blkdev.h> #include <linux/compat.h> #include <linux/migrate.h> #include <linux/ramfs.h> #include <linux/percpu-refcount.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/uaccess.h> #include <linux/nospec.h> #include "internal.h" #define KIOCB_KEY 0 #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock * mutex by aio_read_events_ring(). */ unsigned tail; unsigned magic; unsigned compat_features; unsigned incompat_features; unsigned header_length; /* size of aio_ring */ struct io_event io_events[]; }; /* 128 bytes + ring size */ /* * Plugging is meant to work with larger batches of IOs. If we don't * have more than the below, then don't bother setting up a plug. */ #define AIO_PLUG_THRESHOLD 2 #define AIO_RING_PAGES 8 struct kioctx_table { struct rcu_head rcu; unsigned nr; struct kioctx __rcu *table[]; }; struct kioctx_cpu { unsigned reqs_available; }; struct ctx_rq_wait { struct completion comp; atomic_t count; }; struct kioctx { struct percpu_ref users; atomic_t dead; struct percpu_ref reqs; unsigned long user_id; struct __percpu kioctx_cpu *cpu; /* * For percpu reqs_available, number of slots we move to/from global * counter at a time: */ unsigned req_batch; /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. * * The real limit is nr_events - 1, which will be larger (see * aio_setup_ring()) */ unsigned max_reqs; /* Size of ringbuffer, in units of struct io_event */ unsigned nr_events; unsigned long mmap_base; unsigned long mmap_size; struct page **ring_pages; long nr_pages; struct rcu_work free_rwork; /* see free_ioctx() */ /* * signals when all in-flight requests are done */ struct ctx_rq_wait *rq_wait; struct { /* * This counts the number of available slots in the ringbuffer, * so we avoid overflowing it: it's decremented (if positive) * when allocating a kiocb and incremented when the resulting * io_event is pulled off the ringbuffer. * * We batch accesses to it with a percpu version. */ atomic_t reqs_available; } ____cacheline_aligned_in_smp; struct { spinlock_t ctx_lock; struct list_head active_reqs; /* used for cancellation */ } ____cacheline_aligned_in_smp; struct { struct mutex ring_lock; wait_queue_head_t wait; } ____cacheline_aligned_in_smp; struct { unsigned tail; unsigned completed_events; spinlock_t completion_lock; } ____cacheline_aligned_in_smp; struct page *internal_pages[AIO_RING_PAGES]; struct file *aio_ring_file; unsigned id; }; /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in <linux/fs.h> */ struct fsync_iocb { struct file *file; struct work_struct work; bool datasync; struct cred *creds; }; struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; bool cancelled; bool work_scheduled; bool work_need_resched; struct wait_queue_entry wait; struct work_struct work; }; /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can * access the file pointer through any of the sub-structs, * or directly as just 'ki_filp' in this struct. */ struct aio_kiocb { union { struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll; }; struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ refcount_t ki_refcnt; /* * If the aio_resfd field of the userspace iocb is not zero, * this is the underlying eventfd context to deliver events to. */ struct eventfd_ctx *ki_eventfd; }; /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); unsigned long aio_nr; /* current system wide number of aio requests */ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops; static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { struct file *file; struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; file = alloc_file_pseudo(inode, aio_mnt, "[aio]", O_RDWR, &aio_ring_fops); if (IS_ERR(file)) iput(inode); return file; } static int aio_init_fs_context(struct fs_context *fc) { if (!init_pseudo(fc, AIO_RING_MAGIC)) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; return 0; } /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. */ static int __init aio_setup(void) { static struct file_system_type aio_fs = { .name = "aio", .init_fs_context = aio_init_fs_context, .kill_sb = kill_anon_super, }; aio_mnt = kern_mount(&aio_fs); if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); return 0; } __initcall(aio_setup); static void put_aio_ring_file(struct kioctx *ctx) { struct file *aio_ring_file = ctx->aio_ring_file; struct address_space *i_mapping; if (aio_ring_file) { truncate_setsize(file_inode(aio_ring_file), 0); /* Prevent further access to the kioctx from migratepages */ i_mapping = aio_ring_file->f_mapping; spin_lock(&i_mapping->private_lock); i_mapping->private_data = NULL; ctx->aio_ring_file = NULL; spin_unlock(&i_mapping->private_lock); fput(aio_ring_file); } } static void aio_free_ring(struct kioctx *ctx) { int i; /* Disconnect the kiotx from the ring file. This prevents future * accesses to the kioctx from page migration. */ put_aio_ring_file(ctx); for (i = 0; i < ctx->nr_pages; i++) { struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, page_count(ctx->ring_pages[i])); page = ctx->ring_pages[i]; if (!page) continue; ctx->ring_pages[i] = NULL; put_page(page); } if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { kfree(ctx->ring_pages); ctx->ring_pages = NULL; } } static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); if (!table) goto out_unlock; for (i = 0; i < table->nr; i++) { struct kioctx *ctx; ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { if (!atomic_read(&ctx->dead)) { ctx->user_id = ctx->mmap_base = vma->vm_start; res = 0; } break; } } out_unlock: rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); return res; } static const struct vm_operations_struct aio_ring_vm_ops = { .mremap = aio_ring_mremap, #if IS_ENABLED(CONFIG_MMU) .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, #endif }; static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { vma->vm_flags |= VM_DONTEXPAND; vma->vm_ops = &aio_ring_vm_ops; return 0; } static const struct file_operations aio_ring_fops = { .mmap = aio_ring_mmap, }; #if IS_ENABLED(CONFIG_MIGRATION) static int aio_migratepage(struct address_space *mapping, struct page *new, struct page *old, enum migrate_mode mode) { struct kioctx *ctx; unsigned long flags; pgoff_t idx; int rc; /* * We cannot support the _NO_COPY case here, because copy needs to * happen under the ctx->completion_lock. That does not work with the * migration workflow of MIGRATE_SYNC_NO_COPY. */ if (mode == MIGRATE_SYNC_NO_COPY) return -EINVAL; rc = 0; /* mapping->private_lock here protects against the kioctx teardown. */ spin_lock(&mapping->private_lock); ctx = mapping->private_data; if (!ctx) { rc = -EINVAL; goto out; } /* The ring_lock mutex. The prevents aio_read_events() from writing * to the ring's head, and prevents page migration from mucking in * a partially initialized kiotx. */ if (!mutex_trylock(&ctx->ring_lock)) { rc = -EAGAIN; goto out; } idx = old->index; if (idx < (pgoff_t)ctx->nr_pages) { /* Make sure the old page hasn't already been changed */ if (ctx->ring_pages[idx] != old) rc = -EAGAIN; } else rc = -EINVAL; if (rc != 0) goto out_unlock; /* Writeback must be complete */ BUG_ON(PageWriteback(old)); get_page(new); rc = migrate_page_move_mapping(mapping, new, old, 1); if (rc != MIGRATEPAGE_SUCCESS) { put_page(new); goto out_unlock; } /* Take completion_lock to prevent other writes to the ring buffer * while the old page is copied to the new. This prevents new * events from being lost. */ spin_lock_irqsave(&ctx->completion_lock, flags); migrate_page_copy(new, old); BUG_ON(ctx->ring_pages[idx] != old); ctx->ring_pages[idx] = new; spin_unlock_irqrestore(&ctx->completion_lock, flags); /* The old page is no longer accessible. */ put_page(old); out_unlock: mutex_unlock(&ctx->ring_lock); out: spin_unlock(&mapping->private_lock); return rc; } #endif static const struct address_space_operations aio_ctx_aops = { .set_page_dirty = __set_page_dirty_no_writeback, #if IS_ENABLED(CONFIG_MIGRATION) .migratepage = aio_migratepage, #endif }; static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) { struct aio_ring *ring; struct mm_struct *mm = current->mm; unsigned long size, unused; int nr_pages; int i; struct file *file; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; nr_pages = PFN_UP(size); if (nr_pages < 0) return -EINVAL; file = aio_private_file(ctx, nr_pages); if (IS_ERR(file)) { ctx->aio_ring_file = NULL; return -ENOMEM; } ctx->aio_ring_file = file; nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!ctx->ring_pages) { put_aio_ring_file(ctx); return -ENOMEM; } } for (i = 0; i < nr_pages; i++) { struct page *page; page = find_or_create_page(file->f_mapping, i, GFP_HIGHUSER | __GFP_ZERO); if (!page) break; pr_debug("pid(%d) page[%d]->count=%d\n", current->pid, i, page_count(page)); SetPageUptodate(page); unlock_page(page); ctx->ring_pages[i] = page; } ctx->nr_pages = i; if (unlikely(i != nr_pages)) { aio_free_ring(ctx); return -ENOMEM; } ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); if (mmap_write_lock_killable(mm)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -EINTR; } ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 0, &unused, NULL); mmap_write_unlock(mm); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -ENOMEM; } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); return 0; } #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw); struct kioctx *ctx = req->ki_ctx; unsigned long flags; if (WARN_ON_ONCE(!list_empty(&req->ki_list))) return; spin_lock_irqsave(&ctx->ctx_lock, flags); list_add_tail(&req->ki_list, &ctx->active_reqs); req->ki_cancel = cancel; spin_unlock_irqrestore(&ctx->ctx_lock, flags); } EXPORT_SYMBOL(kiocb_set_cancel_fn); /* * free_ioctx() should be RCU delayed to synchronize against the RCU * protected lookup_ioctx() and also needs process context to call * aio_free_ring(). Use rcu_work. */ static void free_ioctx(struct work_struct *work) { struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx, free_rwork); pr_debug("freeing %p\n", ctx); aio_free_ring(ctx); free_percpu(ctx->cpu); percpu_ref_exit(&ctx->reqs); percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); } static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); /* At this point we know that there are no any in-flight requests */ if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) complete(&ctx->rq_wait->comp); /* Synchronize against RCU protected table->table[] dereferences */ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); queue_rcu_work(system_wq, &ctx->free_rwork); } /* * When this function runs, the kioctx has been removed from the "hash table" * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ static void free_ioctx_users(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, users); struct aio_kiocb *req; spin_lock_irq(&ctx->ctx_lock); while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, struct aio_kiocb, ki_list); req->ki_cancel(&req->rw); list_del_init(&req->ki_list); } spin_unlock_irq(&ctx->ctx_lock); percpu_ref_kill(&ctx->reqs); percpu_ref_put(&ctx->reqs); } static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) { unsigned i, new_nr; struct kioctx_table *table, *old; struct aio_ring *ring; spin_lock(&mm->ioctx_lock); table = rcu_dereference_raw(mm->ioctx_table); while (1) { if (table) for (i = 0; i < table->nr; i++) if (!rcu_access_pointer(table->table[i])) { ctx->id = i; rcu_assign_pointer(table->table[i], ctx); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, * we are protected from page migration * changes ring_pages by ->ring_lock. */ ring = kmap_atomic(ctx->ring_pages[0]); ring->id = ctx->id; kunmap_atomic(ring); return 0; } new_nr = (table ? table->nr : 1) * 4; spin_unlock(&mm->ioctx_lock); table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * new_nr, GFP_KERNEL); if (!table) return -ENOMEM; table->nr = new_nr; spin_lock(&mm->ioctx_lock); old = rcu_dereference_raw(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); } else if (table->nr > old->nr) { memcpy(table->table, old->table, old->nr * sizeof(struct kioctx *)); rcu_assign_pointer(mm->ioctx_table, table); kfree_rcu(old, rcu); } else { kfree(table); table = old; } } } static void aio_nr_sub(unsigned nr) { spin_lock(&aio_nr_lock); if (WARN_ON(aio_nr - nr > aio_nr)) aio_nr = 0; else aio_nr -= nr; spin_unlock(&aio_nr_lock); } /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; /* * Store the original nr_events -- what userspace passed to io_setup(), * for counting against the global limit -- before it changes. */ unsigned int max_reqs = nr_events; /* * We keep track of the number of available ringbuffer slots, to prevent * overflow (reqs_available), and we also use percpu counters for this. * * So since up to half the slots might be on other cpu's percpu counters * and unavailable, double nr_events so userspace sees what they * expected: additionally, we move req_batch slots to/from percpu * counters at a time, so make sure that isn't 0: */ nr_events = max(nr_events, num_possible_cpus() * 4); nr_events *= 2; /* Prevent overflows */ if (nr_events > (0x10000000U / sizeof(struct io_event))) { pr_debug("ENOMEM: nr_events too high\n"); return ERR_PTR(-EINVAL); } if (!nr_events || (unsigned long)max_reqs > aio_max_nr) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); ctx->max_reqs = max_reqs; spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); /* Protect against page migration throughout kiotx setup by keeping * the ring_lock mutex held until setup is complete. */ mutex_lock(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) goto err; err = aio_setup_ring(ctx, nr_events); if (err < 0) goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); if (ctx->req_batch < 1) ctx->req_batch = 1; /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + ctx->max_reqs > aio_max_nr || aio_nr + ctx->max_reqs < aio_nr) { spin_unlock(&aio_nr_lock); err = -EAGAIN; goto err_ctx; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ err = ioctx_add_table(ctx, mm); if (err) goto err_cleanup; /* Release the ring_lock mutex now that all setup is complete. */ mutex_unlock(&ctx->ring_lock); pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; err_cleanup: aio_nr_sub(ctx->max_reqs); err_ctx: atomic_set(&ctx->dead, 1); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); aio_free_ring(ctx); err: mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); percpu_ref_exit(&ctx->reqs); percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } /* kill_ioctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, struct ctx_rq_wait *wait) { struct kioctx_table *table; spin_lock(&mm->ioctx_lock); if (atomic_xchg(&ctx->dead, 1)) { spin_unlock(&mm->ioctx_lock); return -EINVAL; } table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); RCU_INIT_POINTER(table->table[ctx->id], NULL); spin_unlock(&mm->ioctx_lock); /* free_ioctx_reqs() will do the necessary RCU synchronization */ wake_up_all(&ctx->wait); /* * It'd be more correct to do this in free_ioctx(), after all * the outstanding kiocbs have finished - but by then io_destroy * has already returned, so io_setup() could potentially return * -EAGAIN with no ioctxs actually in use (as far as userspace * could tell). */ aio_nr_sub(ctx->max_reqs); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); ctx->rq_wait = wait; percpu_ref_kill(&ctx->users); return 0; } /* * exit_aio: called when the last user of mm goes away. At this point, there is * no way for any new requests to be submited or any of the io_* syscalls to be * called on the context. * * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on * them. */ void exit_aio(struct mm_struct *mm) { struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); struct ctx_rq_wait wait; int i, skipped; if (!table) return; atomic_set(&wait.count, table->nr); init_completion(&wait.comp); skipped = 0; for (i = 0; i < table->nr; ++i) { struct kioctx *ctx = rcu_dereference_protected(table->table[i], true); if (!ctx) { skipped++; continue; } /* * We don't need to bother with munmap() here - exit_mmap(mm) * is coming and it'll unmap everything. And we simply can't, * this is not necessarily our ->mm. * Since kill_ioctx() uses non-zero ->mmap_size as indicator * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; kill_ioctx(mm, ctx, &wait); } if (!atomic_sub_and_test(skipped, &wait.count)) { /* Wait until all IO for the context are done. */ wait_for_completion(&wait.comp); } RCU_INIT_POINTER(mm->ioctx_table, NULL); kfree(table); } static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; unsigned long flags; local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); kcpu->reqs_available += nr; while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } local_irq_restore(flags); } static bool __get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; unsigned long flags; local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; old = avail; avail = atomic_cmpxchg(&ctx->reqs_available, avail, avail - ctx->req_batch); } while (avail != old); kcpu->reqs_available += ctx->req_batch; } ret = true; kcpu->reqs_available--; out: local_irq_restore(flags); return ret; } /* refill_reqs_available * Updates the reqs_available reference counts used for tracking the * number of free slots in the completion ring. This can be called * from aio_complete() (to optimistically update reqs_available) or * from aio_get_req() (the we're out of events case). It must be * called holding ctx->completion_lock. */ static void refill_reqs_available(struct kioctx *ctx, unsigned head, unsigned tail) { unsigned events_in_ring, completed; /* Clamp head since userland can write to it. */ head %= ctx->nr_events; if (head <= tail) events_in_ring = tail - head; else events_in_ring = ctx->nr_events - (head - tail); completed = ctx->completed_events; if (events_in_ring < completed) completed -= events_in_ring; else completed = 0; if (!completed) return; ctx->completed_events -= completed; put_reqs_available(ctx, completed); } /* user_refill_reqs_available * Called to refill reqs_available when aio_get_req() encounters an * out of space in the completion ring. */ static void user_refill_reqs_available(struct kioctx *ctx) { spin_lock_irq(&ctx->completion_lock); if (ctx->completed_events) { struct aio_ring *ring; unsigned head; /* Access of ring->head may race with aio_read_events_ring() * here, but that's okay since whether we read the old version * or the new version, and either will be valid. The important * part is that head cannot pass tail since we prevent * aio_complete() from updating tail by holding * ctx->completion_lock. Even if head is invalid, the check * against ctx->completed_events below will make sure we do the * safe/right thing. */ ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; kunmap_atomic(ring); refill_reqs_available(ctx, head, ctx->tail); } spin_unlock_irq(&ctx->completion_lock); } static bool get_reqs_available(struct kioctx *ctx) { if (__get_reqs_available(ctx)) return true; user_refill_reqs_available(ctx); return __get_reqs_available(ctx); } /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. * * The refcount is initialized to 2 - one for the async op completion, * one for the synchronous code that does this. */ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { struct aio_kiocb *req; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) return NULL; if (unlikely(!get_reqs_available(ctx))) { kmem_cache_free(kiocb_cachep, req); return NULL; } percpu_ref_get(&ctx->reqs); req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); refcount_set(&req->ki_refcnt, 2); req->ki_eventfd = NULL; return req; } static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; struct kioctx_table *table; unsigned id; if (get_user(id, &ring->id)) return NULL; rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); if (!table || id >= table->nr) goto out; id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) ret = ctx; } out: rcu_read_unlock(); return ret; } static inline void iocb_destroy(struct aio_kiocb *iocb) { if (iocb->ki_eventfd) eventfd_ctx_put(iocb->ki_eventfd); if (iocb->ki_filp) fput(iocb->ki_filp); percpu_ref_put(&iocb->ki_ctx->reqs); kmem_cache_free(kiocb_cachep, iocb); } /* aio_complete * Called when the io request on the given iocb is complete. */ static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; unsigned tail, pos, head; unsigned long flags; /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context. */ spin_lock_irqsave(&ctx->completion_lock, flags); tail = ctx->tail; pos = tail + AIO_EVENTS_OFFSET; if (++tail >= ctx->nr_events) tail = 0; ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; *event = iocb->ki_res; kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, (void __user *)(unsigned long)iocb->ki_res.obj, iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again */ smp_wmb(); /* make event visible before updating tail */ ctx->tail = tail; ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; ring->tail = tail; kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); /* * Check if the user asked us to deliver the result through an * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ if (iocb->ki_eventfd) eventfd_signal(iocb->ki_eventfd, 1); /* * We have to order our ring_info tail store above and test * of the wait list below outside the wait lock. This is * like in wake_up_bit() where clearing a bit has to be * ordered with the unlocked test. */ smp_mb(); if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); } static inline void iocb_put(struct aio_kiocb *iocb) { if (refcount_dec_and_test(&iocb->ki_refcnt)) { aio_complete(iocb); iocb_destroy(iocb); } } /* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of * events fetched */ static long aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr) { struct aio_ring *ring; unsigned head, tail, pos; long ret = 0; int copy_ret; /* * The mutex can block and wake us up and that will cause * wait_event_interruptible_hrtimeout() to schedule without sleeping * and repeat. This should be rare enough that it doesn't cause * peformance issues. See the comment in read_events() for more detail. */ sched_annotate_sleep(); mutex_lock(&ctx->ring_lock); /* Access to ->ring_pages here is protected by ctx->ring_lock. */ ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; tail = ring->tail; kunmap_atomic(ring); /* * Ensure that once we've read the current tail pointer, that * we also see the events that were stored up to the tail. */ smp_rmb(); pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); if (head == tail) goto out; head %= ctx->nr_events; tail %= ctx->nr_events; while (ret < nr) { long avail; struct io_event *ev; struct page *page; avail = (head <= tail ? tail : ctx->nr_events) - head; if (head == tail) break; pos = head + AIO_EVENTS_OFFSET; page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; pos %= AIO_EVENTS_PER_PAGE; avail = min(avail, nr - ret); avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); ev = kmap(page); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); kunmap(page); if (unlikely(copy_ret)) { ret = -EFAULT; goto out; } ret += avail; head += avail; head %= ctx->nr_events; } ring = kmap_atomic(ctx->ring_pages[0]); ring->head = head; kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, tail); out: mutex_unlock(&ctx->ring_lock); return ret; } static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, long *i) { long ret = aio_read_events_ring(ctx, event + *i, nr - *i); if (ret > 0) *i += ret; if (unlikely(atomic_read(&ctx->dead))) ret = -EINVAL; if (!*i) *i = ret; return ret < 0 || *i >= min_nr; } static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { long ret = 0; /* * Note that aio_read_events() is being called as the conditional - i.e. * we're calling it after prepare_to_wait() has set task state to * TASK_INTERRUPTIBLE. * * But aio_read_events() can block, and if it blocks it's going to flip * the task state back to TASK_RUNNING. * * This should be ok, provided it doesn't flip the state back to * TASK_RUNNING and return 0 too much - that causes us to spin. That * will only happen if the mutex_lock() call blocks, and we then find * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ if (until == 0) aio_read_events(ctx, min_nr, nr, event, &ret); else wait_event_interruptible_hrtimeout(ctx->wait, aio_read_events(ctx, min_nr, nr, event, &ret), until); return ret; } /* sys_io_setup: * Create an aio_context capable of receiving at least nr_events. * ctxp must not point to an aio_context that already exists, and * must be initialized to 0 prior to the call. On successful * creation of the aio_context, *ctxp is filled in with the resulting * handle. May fail with -EINVAL if *ctxp is not initialized, * if the specified nr_events exceeds internal limits. May fail * with -EAGAIN if the specified nr_events exceeds the user's limit * of available events. May fail with -ENOMEM if insufficient kernel * resources are available. May fail with -EFAULT if an invalid * pointer is passed for ctxp. Will fail with -ENOSYS if not * implemented. */ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; ret = get_user(ctx, ctxp); if (unlikely(ret)) goto out; ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } out: return ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; ret = get_user(ctx, ctx32p); if (unlikely(ret)) goto out; ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { /* truncating is ok because it's a user address */ ret = put_user((u32)ioctx->user_id, ctx32p); if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } out: return ret; } #endif /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not * implemented. May fail with -EINVAL if the context pointed to * is invalid. */ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { struct ctx_rq_wait wait; int ret; init_completion(&wait.comp); atomic_set(&wait.count, 1); /* Pass requests_done to kill_ioctx() where it can be set * in a thread-safe way. If we try to set it here then we have * a race condition if two io_destroy() called simultaneously. */ ret = kill_ioctx(current->mm, ioctx, &wait); percpu_ref_put(&ioctx->users); /* Wait until all IO for the context are done. Otherwise kernel * keep using user-space buffers even if user thinks the context * is destroyed. */ if (!ret) wait_for_completion(&wait.comp); return ret; } pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } static void aio_remove_iocb(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; unsigned long flags; spin_lock_irqsave(&ctx->ctx_lock, flags); list_del(&iocb->ki_list); spin_unlock_irqrestore(&ctx->ctx_lock, flags); } static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) { struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); if (!list_empty_careful(&iocb->ki_list)) aio_remove_iocb(iocb); if (kiocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(kiocb->ki_filp); /* * Tell lockdep we inherited freeze protection from submission * thread. */ if (S_ISREG(inode->i_mode)) __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); file_end_write(kiocb->ki_filp); } iocb->ki_res.res = res; iocb->ki_res.res2 = res2; iocb_put(iocb); } static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) { int ret; req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then * aio_reqprio is interpreted as an I/O scheduling * class and priority. */ ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); return ret; } req->ki_ioprio = iocb->aio_reqprio; } else req->ki_ioprio = get_current_ioprio(); ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) return ret; req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ return 0; } static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; size_t len = iocb->aio_nbytes; if (!vectored) { ssize_t ret = import_single_range(rw, buf, len, *iovec, iter); *iovec = NULL; return ret; } return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat); } static inline void aio_rw_done(struct kiocb *req, ssize_t ret) { switch (ret) { case -EIOCBQUEUED: break; case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: case -ERESTART_RESTARTBLOCK: /* * There's no easy way to restart the syscall since other AIO's * may be already running. Just fail this IO with EINTR. */ ret = -EINTR; fallthrough; default: req->ki_complete(req, ret, 0); } } static int aio_read(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; int ret; ret = aio_prep_rw(req, iocb); if (ret) return ret; file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) return -EINVAL; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) aio_rw_done(req, call_read_iter(file, req, &iter)); kfree(iovec); return ret; } static int aio_write(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; int ret; ret = aio_prep_rw(req, iocb); if (ret) return ret; file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (unlikely(!file->f_op->write_iter)) return -EINVAL; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { /* * Open-code file_start_write here to grab freeze protection, * which will be released by another thread in * aio_complete_rw(). Fool lockdep by telling it the lock got * released so that it doesn't complain about the held lock when * we return to userspace. */ if (S_ISREG(file_inode(file)->i_mode)) { sb_start_write(file_inode(file)->i_sb); __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } req->ki_flags |= IOCB_WRITE; aio_rw_done(req, call_write_iter(file, req, &iter)); } kfree(iovec); return ret; } static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); const struct cred *old_cred = override_creds(iocb->fsync.creds); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); revert_creds(old_cred); put_cred(iocb->fsync.creds); iocb_put(iocb); } static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, bool datasync) { if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; if (unlikely(!req->file->f_op->fsync)) return -EINVAL; req->creds = prepare_creds(); if (!req->creds) return -ENOMEM; req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); schedule_work(&req->work); return 0; } static void aio_poll_put_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); iocb_put(iocb); } /* * Safely lock the waitqueue which the request is on, synchronizing with the * case where the ->poll() provider decides to free its waitqueue early. * * Returns true on success, meaning that req->head->lock was locked, req->wait * is on req->head, and an RCU read lock was taken. Returns false if the * request was already removed from its waitqueue (which might no longer exist). */ static bool poll_iocb_lock_wq(struct poll_iocb *req) { wait_queue_head_t *head; /* * While we hold the waitqueue lock and the waitqueue is nonempty, * wake_up_pollfree() will wait for us. However, taking the waitqueue * lock in the first place can race with the waitqueue being freed. * * We solve this as eventpoll does: by taking advantage of the fact that * all users of wake_up_pollfree() will RCU-delay the actual free. If * we enter rcu_read_lock() and see that the pointer to the queue is * non-NULL, we can then lock it without the memory being freed out from * under us, then check whether the request is still on the queue. * * Keep holding rcu_read_lock() as long as we hold the queue lock, in * case the caller deletes the entry from the queue, leaving it empty. * In that case, only RCU prevents the queue memory from being freed. */ rcu_read_lock(); head = smp_load_acquire(&req->head); if (head) { spin_lock(&head->lock); if (!list_empty(&req->wait.entry)) return true; spin_unlock(&head->lock); } rcu_read_unlock(); return false; } static void poll_iocb_unlock_wq(struct poll_iocb *req) { spin_unlock(&req->head->lock); rcu_read_unlock(); } static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); struct poll_table_struct pt = { ._key = req->events }; struct kioctx *ctx = iocb->ki_ctx; __poll_t mask = 0; if (!READ_ONCE(req->cancelled)) mask = vfs_poll(req->file, &pt) & req->events; /* * Note that ->ki_cancel callers also delete iocb from active_reqs after * calling ->ki_cancel. We need the ctx_lock roundtrip here to * synchronize with them. In the cancellation case the list_del_init * itself is not actually needed, but harmless so we keep it in to * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); if (poll_iocb_lock_wq(req)) { if (!mask && !READ_ONCE(req->cancelled)) { /* * The request isn't actually ready to be completed yet. * Reschedule completion if another wakeup came in. */ if (req->work_need_resched) { schedule_work(&req->work); req->work_need_resched = false; } else { req->work_scheduled = false; } poll_iocb_unlock_wq(req); spin_unlock_irq(&ctx->ctx_lock); return; } list_del_init(&req->wait.entry); poll_iocb_unlock_wq(req); } /* else, POLLFREE has freed the waitqueue, so we must complete */ list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); spin_unlock_irq(&ctx->ctx_lock); iocb_put(iocb); } /* assumes we are called with irqs disabled */ static int aio_poll_cancel(struct kiocb *iocb) { struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll; if (poll_iocb_lock_wq(req)) { WRITE_ONCE(req->cancelled, true); if (!req->work_scheduled) { schedule_work(&aiocb->poll.work); req->work_scheduled = true; } poll_iocb_unlock_wq(req); } /* else, the request was force-cancelled by POLLFREE already */ return 0; } static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); __poll_t mask = key_to_poll(key); unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & req->events)) return 0; /* * Complete the request inline if possible. This requires that three * conditions be met: * 1. An event mask must have been passed. If a plain wakeup was done * instead, then mask == 0 and we have to call vfs_poll() to get * the events, so inline completion isn't possible. * 2. The completion work must not have already been scheduled. * 3. ctx_lock must not be busy. We have to use trylock because we * already hold the waitqueue lock, so this inverts the normal * locking order. Use irqsave/irqrestore because not all * filesystems (e.g. fuse) call this function with IRQs disabled, * yet IRQs have to be disabled before ctx_lock is obtained. */ if (mask && !req->work_scheduled && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { struct kioctx *ctx = iocb->ki_ctx; list_del_init(&req->wait.entry); list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); if (iocb->ki_eventfd && !eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); } spin_unlock_irqrestore(&ctx->ctx_lock, flags); if (iocb) iocb_put(iocb); } else { /* * Schedule the completion work if needed. If it was already * scheduled, record that another wakeup came in. * * Don't remove the request from the waitqueue here, as it might * not actually be complete yet (we won't know until vfs_poll() * is called), and we must not miss any wakeups. POLLFREE is an * exception to this; see below. */ if (req->work_scheduled) { req->work_need_resched = true; } else { schedule_work(&req->work); req->work_scheduled = true; } /* * If the waitqueue is being freed early but we can't complete * the request inline, we have to tear down the request as best * we can. That means immediately removing the request from its * waitqueue and preventing all further accesses to the * waitqueue via the request. We also need to schedule the * completion work (done above). Also mark the request as * cancelled, to potentially skip an unneeded call to ->poll(). */ if (mask & POLLFREE) { WRITE_ONCE(req->cancelled, true); list_del_init(&req->wait.entry); /* * Careful: this *must* be the last step, since as soon * as req->head is NULL'ed out, the request can be * completed and freed, since aio_poll_complete_work() * will no longer need to take the waitqueue lock. */ smp_store_release(&req->head, NULL); } } return 1; } struct aio_poll_table { struct poll_table_struct pt; struct aio_kiocb *iocb; bool queued; int error; }; static void aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); /* multiple wait queues per file are not supported */ if (unlikely(pt->queued)) { pt->error = -EINVAL; return; } pt->queued = true; pt->error = 0; pt->iocb->poll.head = head; add_wait_queue(head, &pt->iocb->poll.wait); } static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ if ((u16)iocb->aio_buf != iocb->aio_buf) return -EINVAL; /* reject fields that are not defined for poll */ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) return -EINVAL; INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; req->cancelled = false; req->work_scheduled = false; req->work_need_resched = false; apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; apt.queued = false; apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); mask = vfs_poll(req->file, &apt.pt) & req->events; spin_lock_irq(&ctx->ctx_lock); if (likely(apt.queued)) { bool on_queue = poll_iocb_lock_wq(req); if (!on_queue || req->work_scheduled) { /* * aio_poll_wake() already either scheduled the async * completion work, or completed the request inline. */ if (apt.error) /* unsupported case: multiple queues */ cancel = true; apt.error = 0; mask = 0; } if (mask || apt.error) { /* Steal to complete synchronously. */ list_del_init(&req->wait.entry); } else if (cancel) { /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); } else if (on_queue) { /* * Actually waiting for an event, so add the request to * active_reqs so that it can be cancelled if needed. */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; } if (on_queue) poll_iocb_unlock_wq(req); } if (mask) { /* no async, we'd stolen it */ aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; } spin_unlock_irq(&ctx->ctx_lock); if (mask) iocb_put(aiocb); return apt.error; } static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, struct iocb __user *user_iocb, struct aio_kiocb *req, bool compat) { req->ki_filp = fget(iocb->aio_fildes); if (unlikely(!req->ki_filp)) return -EBADF; if (iocb->aio_flags & IOCB_FLAG_RESFD) { struct eventfd_ctx *eventfd; /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ eventfd = eventfd_ctx_fdget(iocb->aio_resfd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); req->ki_eventfd = eventfd; } if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) { pr_debug("EFAULT: aio_key\n"); return -EFAULT; } req->ki_res.obj = (u64)(unsigned long)user_iocb; req->ki_res.data = iocb->aio_data; req->ki_res.res = 0; req->ki_res.res2 = 0; switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: return aio_read(&req->rw, iocb, false, compat); case IOCB_CMD_PWRITE: return aio_write(&req->rw, iocb, false, compat); case IOCB_CMD_PREADV: return aio_read(&req->rw, iocb, true, compat); case IOCB_CMD_PWRITEV: return aio_write(&req->rw, iocb, true, compat); case IOCB_CMD_FSYNC: return aio_fsync(&req->fsync, iocb, false); case IOCB_CMD_FDSYNC: return aio_fsync(&req->fsync, iocb, true); case IOCB_CMD_POLL: return aio_poll(req, iocb); default: pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); return -EINVAL; } } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { struct aio_kiocb *req; struct iocb iocb; int err; if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) return -EFAULT; /* enforce forwards compatibility on users */ if (unlikely(iocb.aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( (iocb.aio_buf != (unsigned long)iocb.aio_buf) || (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || ((ssize_t)iocb.aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; } req = aio_get_req(ctx); if (unlikely(!req)) return -EAGAIN; err = __io_submit_one(ctx, &iocb, user_iocb, req, compat); /* Done with the synchronous reference */ iocb_put(req); /* * If err is 0, we'd either done aio_complete() ourselves or have * arranged for that to be done asynchronously. Anything non-zero * means that we need to destroy req ourselves. */ if (unlikely(err)) { iocb_destroy(req); put_reqs_available(ctx, 1); } return err; } /* sys_io_submit: * Queue the nr iocbs pointed to by iocbpp for processing. Returns * the number of iocbs queued. May return -EINVAL if the aio_context * specified by ctx_id is invalid, if nr is < 0, if the iocb at * *iocbpp[0] is not properly initialized, if the operation specified * is invalid for the file descriptor in the iocb. May fail with * -EFAULT if any of the data structures point to invalid data. May * fail with -EBADF if the file descriptor specified in the first * iocb is invalid. May fail with -EAGAIN if insufficient resources * are available to queue any iocbs. Will return 0 if nr is 0. Will * fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, struct iocb __user * __user *, iocbpp) { struct kioctx *ctx; long ret = 0; int i = 0; struct blk_plug plug; if (unlikely(nr < 0)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } if (nr > ctx->nr_events) nr = ctx->nr_events; if (nr > AIO_PLUG_THRESHOLD) blk_start_plug(&plug); for (i = 0; i < nr; i++) { struct iocb __user *user_iocb; if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } ret = io_submit_one(ctx, user_iocb, false); if (ret) break; } if (nr > AIO_PLUG_THRESHOLD) blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, int, nr, compat_uptr_t __user *, iocbpp) { struct kioctx *ctx; long ret = 0; int i = 0; struct blk_plug plug; if (unlikely(nr < 0)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } if (nr > ctx->nr_events) nr = ctx->nr_events; if (nr > AIO_PLUG_THRESHOLD) blk_start_plug(&plug); for (i = 0; i < nr; i++) { compat_uptr_t user_iocb; if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } ret = io_submit_one(ctx, compat_ptr(user_iocb), true); if (ret) break; } if (nr > AIO_PLUG_THRESHOLD) blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; } #endif /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is * copied into the memory pointed to by result without being placed * into the completion queue and 0 is returned. May fail with * -EFAULT if any of the data structures pointed to are invalid. * May fail with -EINVAL if aio_context specified by ctx_id is * invalid. May fail with -EAGAIN if the iocb specified was not * cancelled. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { struct kioctx *ctx; struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; if (unlikely(key != KIOCB_KEY)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL; spin_lock_irq(&ctx->ctx_lock); /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); list_del_init(&kiocb->ki_list); break; } } spin_unlock_irq(&ctx->ctx_lock); if (!ret) { /* * The result argument is no longer used - the io_event is * always delivered via the ring buffer. -EINPROGRESS indicates * cancellation is progress: */ ret = -EINPROGRESS; } percpu_ref_put(&ctx->users); return ret; } static long do_io_getevents(aio_context_t ctx_id, long min_nr, long nr, struct io_event __user *events, struct timespec64 *ts) { ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX; struct kioctx *ioctx = lookup_ioctx(ctx_id); long ret = -EINVAL; if (likely(ioctx)) { if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, until); percpu_ref_put(&ioctx->users); } return ret; } /* io_getevents: * Attempts to read at least min_nr events and up to nr events from * the completion queue for the aio_context specified by ctx_id. If * it succeeds, the number of read events is returned. May fail with * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is * out of range, if timeout is out of range. May fail with -EFAULT * if any of the memory specified is invalid. May return 0 or * < min_nr if the timeout specified by timeout has elapsed * before sufficient events are available, where timeout == NULL * specifies an infinite timeout. Note that the timeout pointed to by * timeout is relative. Will fail with -ENOSYS if not implemented. */ #ifdef CONFIG_64BIT SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout) { struct timespec64 ts; int ret; if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); if (!ret && signal_pending(current)) ret = -EINTR; return ret; } #endif struct __aio_sigset { const sigset_t __user *sigmask; size_t sigsetsize; }; SYSCALL_DEFINE6(io_pgetevents, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; struct timespec64 ts; bool interrupted; int ret; if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(io_pgetevents_time32, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; struct timespec64 ts; bool interrupted; int ret; if (timeout && unlikely(get_old_timespec32(&ts, timeout))) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id, __s32, min_nr, __s32, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout) { struct timespec64 t; int ret; if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); if (!ret && signal_pending(current)) ret = -EINTR; return ret; } #endif #ifdef CONFIG_COMPAT struct __compat_aio_sigset { compat_uptr_t sigmask; compat_size_t sigsetsize; }; #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(io_pgetevents, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; if (timeout && get_timespec64(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif
799 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 // SPDX-License-Identifier: GPL-2.0 /* * ACPI support * * Copyright (C) 2020, Intel Corporation * Author: Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/acpi.h> #include <linux/pm_runtime.h> #include "tb.h" static acpi_status tb_acpi_add_link(acpi_handle handle, u32 level, void *data, void **return_value) { struct fwnode_reference_args args; struct fwnode_handle *fwnode; struct tb_nhi *nhi = data; struct acpi_device *adev; struct pci_dev *pdev; struct device *dev; int ret; if (acpi_bus_get_device(handle, &adev)) return AE_OK; fwnode = acpi_fwnode_handle(adev); ret = fwnode_property_get_reference_args(fwnode, "usb4-host-interface", NULL, 0, 0, &args); if (ret) return AE_OK; /* It needs to reference this NHI */ if (nhi->pdev->dev.fwnode != args.fwnode) goto out_put; /* * Try to find physical device walking upwards to the hierarcy. * We need to do this because the xHCI driver might not yet be * bound so the USB3 SuperSpeed ports are not yet created. */ dev = acpi_get_first_physical_node(adev); while (!dev) { adev = adev->parent; if (!adev) break; dev = acpi_get_first_physical_node(adev); } if (!dev) goto out_put; /* * Check that the device is PCIe. This is because USB3 * SuperSpeed ports have this property and they are not power * managed with the xHCI and the SuperSpeed hub so we create the * link from xHCI instead. */ while (dev && !dev_is_pci(dev)) dev = dev->parent; if (!dev) goto out_put; /* * Check that this actually matches the type of device we * expect. It should either be xHCI or PCIe root/downstream * port. */ pdev = to_pci_dev(dev); if (pdev->class == PCI_CLASS_SERIAL_USB_XHCI || (pci_is_pcie(pdev) && (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM))) { const struct device_link *link; /* * Make them both active first to make sure the NHI does * not runtime suspend before the consumer. The * pm_runtime_put() below then allows the consumer to * runtime suspend again (which then allows NHI runtime * suspend too now that the device link is established). */ pm_runtime_get_sync(&pdev->dev); link = device_link_add(&pdev->dev, &nhi->pdev->dev, DL_FLAG_AUTOREMOVE_SUPPLIER | DL_FLAG_RPM_ACTIVE | DL_FLAG_PM_RUNTIME); if (link) { dev_dbg(&nhi->pdev->dev, "created link from %s\n", dev_name(&pdev->dev)); } else { dev_warn(&nhi->pdev->dev, "device link creation from %s failed\n", dev_name(&pdev->dev)); } pm_runtime_put(&pdev->dev); } out_put: fwnode_handle_put(args.fwnode); return AE_OK; } /** * tb_acpi_add_links() - Add device links based on ACPI description * @nhi: Pointer to NHI * * Goes over ACPI namespace finding tunneled ports that reference to * @nhi ACPI node. For each reference a device link is added. The link * is automatically removed by the driver core. */ void tb_acpi_add_links(struct tb_nhi *nhi) { acpi_status status; if (!has_acpi_companion(&nhi->pdev->dev)) return; /* * Find all devices that have usb4-host-controller interface * property that references to this NHI. */ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, 32, tb_acpi_add_link, NULL, nhi, NULL); if (ACPI_FAILURE(status)) dev_warn(&nhi->pdev->dev, "failed to enumerate tunneled ports\n"); } /** * tb_acpi_is_native() - Did the platform grant native TBT/USB4 control * * Returns %true if the platform granted OS native control over * TBT/USB4. In this case software based connection manager can be used, * otherwise there is firmware based connection manager running. */ bool tb_acpi_is_native(void) { return osc_sb_native_usb4_support_confirmed && osc_sb_native_usb4_control; } /** * tb_acpi_may_tunnel_usb3() - Is USB3 tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native USB3 tunneling. */ bool tb_acpi_may_tunnel_usb3(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_USB3_TUNNELING; return true; } /** * tb_acpi_may_tunnel_dp() - Is DisplayPort tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native DP tunneling. */ bool tb_acpi_may_tunnel_dp(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_DP_TUNNELING; return true; } /** * tb_acpi_may_tunnel_pcie() - Is PCIe tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native PCIe tunneling. */ bool tb_acpi_may_tunnel_pcie(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_PCIE_TUNNELING; return true; } /** * tb_acpi_is_xdomain_allowed() - Are XDomain connections allowed * * When software based connection manager is used, this function * returns %true if platform allows XDomain connections. */ bool tb_acpi_is_xdomain_allowed(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_XDOMAIN; return true; } /* UUID for retimer _DSM: e0053122-795b-4122-8a5e-57be1d26acb3 */ static const guid_t retimer_dsm_guid = GUID_INIT(0xe0053122, 0x795b, 0x4122, 0x8a, 0x5e, 0x57, 0xbe, 0x1d, 0x26, 0xac, 0xb3); #define RETIMER_DSM_QUERY_ONLINE_STATE 1 #define RETIMER_DSM_SET_ONLINE_STATE 2 static int tb_acpi_retimer_set_power(struct tb_port *port, bool power) { struct usb4_port *usb4 = port->usb4; union acpi_object argv4[2]; struct acpi_device *adev; union acpi_object *obj; int ret; if (!usb4->can_offline) return 0; adev = ACPI_COMPANION(&usb4->dev); if (WARN_ON(!adev)) return 0; /* Check if we are already powered on (and in correct mode) */ obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_QUERY_ONLINE_STATE, NULL, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: query online _DSM failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (power == ret) return 0; tb_port_dbg(port, "ACPI: calling _DSM to power %s retimers\n", power ? "on" : "off"); argv4[0].type = ACPI_TYPE_PACKAGE; argv4[0].package.count = 1; argv4[0].package.elements = &argv4[1]; argv4[1].integer.type = ACPI_TYPE_INTEGER; argv4[1].integer.value = power; obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_SET_ONLINE_STATE, argv4, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: set online state _DSM evaluation failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (ret >= 0) { if (power) return ret == 1 ? 0 : -EBUSY; return 0; } tb_port_warn(port, "ACPI: set online state _DSM failed with error %d\n", ret); return -EIO; } /** * tb_acpi_power_on_retimers() - Call platform to power on retimers * @port: USB4 port * * Calls platform to turn on power to all retimers behind this USB4 * port. After this function returns successfully the caller can * continue with the normal retimer flows (as specified in the USB4 * spec). Note if this returns %-EBUSY it means the type-C port is in * non-USB4/TBT mode (there is non-USB4/TBT device connected). * * This should only be called if the USB4/TBT link is not up. * * Returns %0 on success. */ int tb_acpi_power_on_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, true); } /** * tb_acpi_power_off_retimers() - Call platform to power off retimers * @port: USB4 port * * This is the opposite of tb_acpi_power_on_retimers(). After returning * successfully the normal operations with the @port can continue. * * Returns %0 on success. */ int tb_acpi_power_off_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, false); } static bool tb_acpi_bus_match(struct device *dev) { return tb_is_switch(dev) || tb_is_usb4_port_device(dev); } static struct acpi_device *tb_acpi_find_port(struct acpi_device *adev, const struct tb_port *port) { struct acpi_device *port_adev; if (!adev) return NULL; /* * Device routers exists under the downstream facing USB4 port * of the parent router. Their _ADR is always 0. */ list_for_each_entry(port_adev, &adev->children, node) { if (acpi_device_adr(port_adev) == port->port) return port_adev; } return NULL; } static struct acpi_device *tb_acpi_switch_find_companion(struct tb_switch *sw) { struct acpi_device *adev = NULL; struct tb_switch *parent_sw; parent_sw = tb_switch_parent(sw); if (parent_sw) { struct tb_port *port = tb_port_at(tb_route(sw), parent_sw); struct acpi_device *port_adev; port_adev = tb_acpi_find_port(ACPI_COMPANION(&parent_sw->dev), port); if (port_adev) adev = acpi_find_child_device(port_adev, 0, false); } else { struct tb_nhi *nhi = sw->tb->nhi; struct acpi_device *parent_adev; parent_adev = ACPI_COMPANION(&nhi->pdev->dev); if (parent_adev) adev = acpi_find_child_device(parent_adev, 0, false); } return adev; } static struct acpi_device *tb_acpi_find_companion(struct device *dev) { /* * The Thunderbolt/USB4 hierarchy looks like following: * * Device (NHI) * Device (HR) // Host router _ADR == 0 * Device (DFP0) // Downstream port _ADR == lane 0 adapter * Device (DR) // Device router _ADR == 0 * Device (UFP) // Upstream port _ADR == lane 0 adapter * Device (DFP1) // Downstream port _ADR == lane 0 adapter number * * At the moment we bind the host router to the corresponding * Linux device. */ if (tb_is_switch(dev)) return tb_acpi_switch_find_companion(tb_to_switch(dev)); else if (tb_is_usb4_port_device(dev)) return tb_acpi_find_port(ACPI_COMPANION(dev->parent), tb_to_usb4_port_device(dev)->port); return NULL; } static void tb_acpi_setup(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); struct usb4_port *usb4 = tb_to_usb4_port_device(dev); if (!adev || !usb4) return; if (acpi_check_dsm(adev->handle, &retimer_dsm_guid, 1, BIT(RETIMER_DSM_QUERY_ONLINE_STATE) | BIT(RETIMER_DSM_SET_ONLINE_STATE))) usb4->can_offline = true; } static struct acpi_bus_type tb_acpi_bus = { .name = "thunderbolt", .match = tb_acpi_bus_match, .find_companion = tb_acpi_find_companion, .setup = tb_acpi_setup, }; int tb_acpi_init(void) { return register_acpi_bus_type(&tb_acpi_bus); } void tb_acpi_exit(void) { unregister_acpi_bus_type(&tb_acpi_bus); }
10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo) { u8 add_addr = READ_ONCE(msk->pm.addr_signal); pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo); lockdep_assert_held(&msk->pm.lock); if (add_addr & (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) { pr_warn("addr_signal error, add_addr=%d, echo=%d", add_addr, echo); return -EINVAL; } if (echo) { msk->pm.remote = *addr; add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); } else { msk->pm.local = *addr; add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); } WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { u8 rm_addr = READ_ONCE(msk->pm.addr_signal); pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); if (rm_addr) { pr_warn("addr_signal error, rm_addr=%d", rm_addr); return -EINVAL; } msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); mptcp_pm_nl_addr_send_ack(msk); return 0; } int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, rm_list); spin_unlock_bh(&msk->pm.lock); return 0; } /* path manager event handlers */ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side); WRITE_ONCE(pm->server_side, server_side); mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC); } bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; unsigned int subflows_max; int ret = 0; subflows_max = mptcp_pm_get_subflows_max(msk); pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows, subflows_max, READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) return false; spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { ret = pm->subflows < subflows_max; if (ret && ++pm->subflows == subflows_max) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); return ret; } /* return true if the new status bit is currently cleared, that is, this event * can be server, eventually by an already scheduled work */ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, enum mptcp_pm_status new_status) { pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status, BIT(new_status)); if (msk->pm.status & BIT(new_status)) return false; msk->pm.status |= BIT(new_status); mptcp_schedule_work((struct sock *)msk); return true; } void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { struct mptcp_pm_data *pm = &msk->pm; bool announce = false; pr_debug("msk=%p", msk); spin_lock_bh(&pm->lock); /* mptcp_pm_fully_established() can be invoked by multiple * racing paths - accept() and check_fully_established() * be sure to serve this event only once. */ if (READ_ONCE(pm->work_pending) && !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) announce = true; msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); spin_unlock_bh(&pm->lock); if (announce) mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, gfp); } void mptcp_pm_connection_closed(struct mptcp_sock *msk) { pr_debug("msk=%p", msk); } void mptcp_pm_subflow_established(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p", msk); if (!READ_ONCE(pm->work_pending)) return; spin_lock_bh(&pm->lock); if (READ_ONCE(pm->work_pending)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); } void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id) { pr_debug("msk=%p", msk); } void mptcp_pm_add_addr_received(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, READ_ONCE(pm->accept_addr)); mptcp_event_addr_announced(msk, addr); spin_lock_bh(&pm->lock); if (!READ_ONCE(pm->accept_addr)) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; } else { __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, struct mptcp_addr_info *addr) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p", msk); spin_lock_bh(&pm->lock); if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) { if (!mptcp_pm_should_add_signal(msk)) return; mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); } void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { struct mptcp_pm_data *pm = &msk->pm; u8 i; pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr); for (i = 0; i < rm_list->nr; i++) mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED)) pm->rm_list_rx = *rm_list; else __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP); spin_unlock_bh(&pm->lock); } void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); subflow->backup = bkup; mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC); } void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { pr_debug("fail_seq=%llu", fail_seq); } /* path manager helpers */ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, bool *port, bool *drop_other_suboptions) { int ret = false; u8 add_addr; u8 family; spin_lock_bh(&msk->pm.lock); /* double check after the lock is acquired */ if (!mptcp_pm_should_add_signal(msk)) goto out_unlock; /* always drop every other options for pure ack ADD_ADDR; this is a * plain dup-ack from TCP perspective. The other MPTCP-relevant info, * if any, will be carried by the 'original' TCP ack */ if (skb && skb_is_tcp_pure_ack(skb)) { remaining += opt_size; *drop_other_suboptions = true; } *echo = mptcp_pm_should_add_signal_echo(msk); *port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); family = *echo ? msk->pm.remote.family : msk->pm.local.family; if (remaining < mptcp_add_addr_len(family, *echo, *port)) goto out_unlock; if (*echo) { *addr = msk->pm.remote; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO); } else { *addr = msk->pm.local; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL); } WRITE_ONCE(msk->pm.addr_signal, add_addr); ret = true; out_unlock: spin_unlock_bh(&msk->pm.lock); return ret; } bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list) { int ret = false, len; u8 rm_addr; spin_lock_bh(&msk->pm.lock); /* double check after the lock is acquired */ if (!mptcp_pm_should_rm_signal(msk)) goto out_unlock; rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL); len = mptcp_rm_addr_len(&msk->pm.rm_list_tx); if (len < 0) { WRITE_ONCE(msk->pm.addr_signal, rm_addr); goto out_unlock; } if (remaining < len) goto out_unlock; *rm_list = msk->pm.rm_list_tx; WRITE_ONCE(msk->pm.addr_signal, rm_addr); ret = true; out_unlock: spin_unlock_bh(&msk->pm.lock); return ret; } int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { return mptcp_pm_nl_get_local_id(msk, skc); } void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp); /* keep track of rtx periods with no progress */ if (!subflow->stale_count) { subflow->stale_rcv_tstamp = rcv_tstamp; subflow->stale_count++; } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { if (subflow->stale_count < U8_MAX) subflow->stale_count++; mptcp_pm_nl_subflow_chk_stale(msk, ssk); } else { subflow->stale_count = 0; mptcp_subflow_set_active(subflow); } } void mptcp_pm_data_init(struct mptcp_sock *msk) { msk->pm.add_addr_signaled = 0; msk->pm.add_addr_accepted = 0; msk->pm.local_addr_used = 0; msk->pm.subflows = 0; msk->pm.rm_list_tx.nr = 0; msk->pm.rm_list_rx.nr = 0; WRITE_ONCE(msk->pm.work_pending, false); WRITE_ONCE(msk->pm.addr_signal, 0); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); WRITE_ONCE(msk->pm.remote_deny_join_id0, false); msk->pm.status = 0; spin_lock_init(&msk->pm.lock); INIT_LIST_HEAD(&msk->pm.anno_list); mptcp_pm_nl_data_init(msk); } void __init mptcp_pm_init(void) { mptcp_pm_nl_init(); }
63 63 63 6 63 64 61 64 11 62 1 1 62 63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * The base lksctp header. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Ryan Layer <rmlayer@us.ibm.com> * Kevin Gao <kevin.gao@intel.com> */ #ifndef __net_sctp_h__ #define __net_sctp_h__ /* Header Strategy. * Start getting some control over the header file depencies: * includes * constants * structs * prototypes * macros, externs, and inlines * * Move test_frame specific items out of the kernel headers * and into the test frame headers. This is not perfect in any sense * and will continue to evolve. */ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/tty.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/idr.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_route.h> #endif #include <linux/uaccess.h> #include <asm/page.h> #include <net/sock.h> #include <net/snmp.h> #include <net/sctp/structs.h> #include <net/sctp/constants.h> #ifdef CONFIG_IP_SCTP_MODULE #define SCTP_PROTOSW_FLAG 0 #else /* static! */ #define SCTP_PROTOSW_FLAG INET_PROTOSW_PERMANENT #endif /* Round an int up to the next multiple of 4. */ #define SCTP_PAD4(s) (((s)+3)&~3) /* Truncate to the previous multiple of 4. */ #define SCTP_TRUNC4(s) ((s)&~3) /* * Function declarations. */ /* * sctp/protocol.c */ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *addr, enum sctp_scope, gfp_t gfp, int flags); struct sctp_pf *sctp_get_pf_specific(sa_family_t family); int sctp_register_pf(struct sctp_pf *, sa_family_t); void sctp_addr_wq_mgmt(struct net *, struct sctp_sockaddr_entry *, int); int sctp_udp_sock_start(struct net *net); void sctp_udp_sock_stop(struct net *net); /* * sctp/socket.c */ int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); void sctp_data_ready(struct sock *sk); __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc); extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); typedef int (*sctp_callback_t)(struct sctp_endpoint *, struct sctp_transport *, void *); void sctp_transport_walk_start(struct rhashtable_iter *iter); void sctp_transport_walk_stop(struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_next(struct net *net, struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, void *p); int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info); /* * sctp/primitive.c */ int sctp_primitive_ASSOCIATE(struct net *, struct sctp_association *, void *arg); int sctp_primitive_SHUTDOWN(struct net *, struct sctp_association *, void *arg); int sctp_primitive_ABORT(struct net *, struct sctp_association *, void *arg); int sctp_primitive_SEND(struct net *, struct sctp_association *, void *arg); int sctp_primitive_REQUESTHEARTBEAT(struct net *, struct sctp_association *, void *arg); int sctp_primitive_ASCONF(struct net *, struct sctp_association *, void *arg); int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc, void *arg); /* * sctp/input.c */ int sctp_rcv(struct sk_buff *skb); int sctp_v4_err(struct sk_buff *skb, u32 info); int sctp_hash_endpoint(struct sctp_endpoint *ep); void sctp_unhash_endpoint(struct sctp_endpoint *); struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *, struct sctphdr *, struct sctp_association **, struct sctp_transport **); void sctp_err_finish(struct sock *, struct sctp_transport *); int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb); int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb); void sctp_icmp_frag_needed(struct sock *, struct sctp_association *, struct sctp_transport *t, __u32 pmtu); void sctp_icmp_redirect(struct sock *, struct sctp_transport *, struct sk_buff *); void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t); void sctp_backlog_migrate(struct sctp_association *assoc, struct sock *oldsk, struct sock *newsk); int sctp_transport_hashtable_init(void); void sctp_transport_hashtable_destroy(void); int sctp_hash_transport(struct sctp_transport *t); void sctp_unhash_transport(struct sctp_transport *t); struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr); struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr); /* * sctp/proc.c */ int __net_init sctp_proc_init(struct net *net); /* * sctp/offload.c */ int sctp_offload_init(void); /* * sctp/stream_sched.c */ void sctp_sched_ops_init(void); /* * sctp/stream.c */ int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params); int sctp_send_reset_assoc(struct sctp_association *asoc); int sctp_send_add_streams(struct sctp_association *asoc, struct sctp_add_streams *params); /* * Module global variables */ /* * sctp/protocol.c */ extern struct kmem_cache *sctp_chunk_cachep __read_mostly; extern struct kmem_cache *sctp_bucket_cachep __read_mostly; extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; /* * Section: Macros, externs, and inlines */ /* SCTP SNMP MIB stats handlers */ #define SCTP_INC_STATS(net, field) SNMP_INC_STATS((net)->sctp.sctp_statistics, field) #define __SCTP_INC_STATS(net, field) __SNMP_INC_STATS((net)->sctp.sctp_statistics, field) #define SCTP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->sctp.sctp_statistics, field) /* sctp mib definitions */ enum { SCTP_MIB_NUM = 0, SCTP_MIB_CURRESTAB, /* CurrEstab */ SCTP_MIB_ACTIVEESTABS, /* ActiveEstabs */ SCTP_MIB_PASSIVEESTABS, /* PassiveEstabs */ SCTP_MIB_ABORTEDS, /* Aborteds */ SCTP_MIB_SHUTDOWNS, /* Shutdowns */ SCTP_MIB_OUTOFBLUES, /* OutOfBlues */ SCTP_MIB_CHECKSUMERRORS, /* ChecksumErrors */ SCTP_MIB_OUTCTRLCHUNKS, /* OutCtrlChunks */ SCTP_MIB_OUTORDERCHUNKS, /* OutOrderChunks */ SCTP_MIB_OUTUNORDERCHUNKS, /* OutUnorderChunks */ SCTP_MIB_INCTRLCHUNKS, /* InCtrlChunks */ SCTP_MIB_INORDERCHUNKS, /* InOrderChunks */ SCTP_MIB_INUNORDERCHUNKS, /* InUnorderChunks */ SCTP_MIB_FRAGUSRMSGS, /* FragUsrMsgs */ SCTP_MIB_REASMUSRMSGS, /* ReasmUsrMsgs */ SCTP_MIB_OUTSCTPPACKS, /* OutSCTPPacks */ SCTP_MIB_INSCTPPACKS, /* InSCTPPacks */ SCTP_MIB_T1_INIT_EXPIREDS, SCTP_MIB_T1_COOKIE_EXPIREDS, SCTP_MIB_T2_SHUTDOWN_EXPIREDS, SCTP_MIB_T3_RTX_EXPIREDS, SCTP_MIB_T4_RTO_EXPIREDS, SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS, SCTP_MIB_DELAY_SACK_EXPIREDS, SCTP_MIB_AUTOCLOSE_EXPIREDS, SCTP_MIB_T1_RETRANSMITS, SCTP_MIB_T3_RETRANSMITS, SCTP_MIB_PMTUD_RETRANSMITS, SCTP_MIB_FAST_RETRANSMITS, SCTP_MIB_IN_PKT_SOFTIRQ, SCTP_MIB_IN_PKT_BACKLOG, SCTP_MIB_IN_PKT_DISCARDS, SCTP_MIB_IN_DATA_CHUNK_DISCARDS, __SCTP_MIB_MAX }; #define SCTP_MIB_MAX __SCTP_MIB_MAX struct sctp_mib { unsigned long mibs[SCTP_MIB_MAX]; }; /* helper function to track stats about max rto and related transport */ static inline void sctp_max_rto(struct sctp_association *asoc, struct sctp_transport *trans) { if (asoc->stats.max_obs_rto < (__u64)trans->rto) { asoc->stats.max_obs_rto = trans->rto; memset(&asoc->stats.obs_rto_ipaddr, 0, sizeof(struct sockaddr_storage)); memcpy(&asoc->stats.obs_rto_ipaddr, &trans->ipaddr, trans->af_specific->sockaddr_len); } } /* * Macros for keeping a global reference of object allocations. */ #ifdef CONFIG_SCTP_DBG_OBJCNT extern atomic_t sctp_dbg_objcnt_sock; extern atomic_t sctp_dbg_objcnt_ep; extern atomic_t sctp_dbg_objcnt_assoc; extern atomic_t sctp_dbg_objcnt_transport; extern atomic_t sctp_dbg_objcnt_chunk; extern atomic_t sctp_dbg_objcnt_bind_addr; extern atomic_t sctp_dbg_objcnt_bind_bucket; extern atomic_t sctp_dbg_objcnt_addr; extern atomic_t sctp_dbg_objcnt_datamsg; extern atomic_t sctp_dbg_objcnt_keys; /* Macros to atomically increment/decrement objcnt counters. */ #define SCTP_DBG_OBJCNT_INC(name) \ atomic_inc(&sctp_dbg_objcnt_## name) #define SCTP_DBG_OBJCNT_DEC(name) \ atomic_dec(&sctp_dbg_objcnt_## name) #define SCTP_DBG_OBJCNT(name) \ atomic_t sctp_dbg_objcnt_## name = ATOMIC_INIT(0) /* Macro to help create new entries in the global array of * objcnt counters. */ #define SCTP_DBG_OBJCNT_ENTRY(name) \ {.label= #name, .counter= &sctp_dbg_objcnt_## name} void sctp_dbg_objcnt_init(struct net *); #else #define SCTP_DBG_OBJCNT_INC(name) #define SCTP_DBG_OBJCNT_DEC(name) static inline void sctp_dbg_objcnt_init(struct net *net) { return; } #endif /* CONFIG_SCTP_DBG_OBJCOUNT */ #if defined CONFIG_SYSCTL void sctp_sysctl_register(void); void sctp_sysctl_unregister(void); int sctp_sysctl_net_register(struct net *net); void sctp_sysctl_net_unregister(struct net *net); #else static inline void sctp_sysctl_register(void) { return; } static inline void sctp_sysctl_unregister(void) { return; } static inline int sctp_sysctl_net_register(struct net *net) { return 0; } static inline void sctp_sysctl_net_unregister(struct net *net) { return; } #endif /* Size of Supported Address Parameter for 'x' address types. */ #define SCTP_SAT_LEN(x) (sizeof(struct sctp_paramhdr) + (x) * sizeof(__u16)) #if IS_ENABLED(CONFIG_IPV6) void sctp_v6_pf_init(void); void sctp_v6_pf_exit(void); int sctp_v6_protosw_init(void); void sctp_v6_protosw_exit(void); int sctp_v6_add_protocol(void); void sctp_v6_del_protocol(void); #else /* #ifdef defined(CONFIG_IPV6) */ static inline void sctp_v6_pf_init(void) { return; } static inline void sctp_v6_pf_exit(void) { return; } static inline int sctp_v6_protosw_init(void) { return 0; } static inline void sctp_v6_protosw_exit(void) { return; } static inline int sctp_v6_add_protocol(void) { return 0; } static inline void sctp_v6_del_protocol(void) { return; } #endif /* #if defined(CONFIG_IPV6) */ /* Map an association to an assoc_id. */ static inline sctp_assoc_t sctp_assoc2id(const struct sctp_association *asoc) { return asoc ? asoc->assoc_id : 0; } static inline enum sctp_sstat_state sctp_assoc_to_state(const struct sctp_association *asoc) { /* SCTP's uapi always had SCTP_EMPTY(=0) as a dummy state, but we * got rid of it in kernel space. Therefore SCTP_CLOSED et al * start at =1 in user space, but actually as =0 in kernel space. * Now that we can not break user space and SCTP_EMPTY is exposed * there, we need to fix it up with an ugly offset not to break * applications. :( */ return asoc->state + 1; } /* Look up the association by its id. */ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id); int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp); /* A macro to walk a list of skbs. */ #define sctp_skb_for_each(pos, head, tmp) \ skb_queue_walk_safe(head, pos, tmp) /** * sctp_list_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. The head item is * returned or %NULL if the list is empty. */ static inline struct list_head *sctp_list_dequeue(struct list_head *list) { struct list_head *result = NULL; if (!list_empty(list)) { result = list->next; list_del_init(result); } return result; } /* SCTP version of skb_set_owner_r. We need this one because * of the way we have to do receive buffer accounting on bundled * chunks. */ static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { struct sctp_ulpevent *event = sctp_skb2event(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = sctp_sock_rfree; atomic_add(event->rmem_len, &sk->sk_rmem_alloc); /* * This mimics the behavior of skb_set_owner_r */ sk_mem_charge(sk, event->rmem_len); } /* Tests if the list has one and only one entry. */ static inline int sctp_list_single_entry(struct list_head *head) { return list_is_singular(head); } static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk) { return !list_empty(&chunk->list); } /* Walk through a list of TLV parameters. Don't trust the * individual parameter lengths and instead depend on * the chunk length to indicate when to stop. Make sure * there is room for a param header too. */ #define sctp_walk_params(pos, chunk, member)\ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) #define _sctp_walk_params(pos, chunk, end, member)\ for (pos.v = chunk->member;\ (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\ (void *)chunk + end) &&\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(struct sctp_paramhdr);\ pos.v += SCTP_PAD4(ntohs(pos.p->length))) #define sctp_walk_errors(err, chunk_hdr)\ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) #define _sctp_walk_errors(err, chunk_hdr, end)\ for (err = (struct sctp_errhdr *)((void *)chunk_hdr + \ sizeof(struct sctp_chunkhdr));\ ((void *)err + offsetof(struct sctp_errhdr, length) + sizeof(err->length) <=\ (void *)chunk_hdr + end) &&\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(struct sctp_errhdr); \ err = (struct sctp_errhdr *)((void *)err + SCTP_PAD4(ntohs(err->length)))) #define sctp_walk_fwdtsn(pos, chunk)\ _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk)) #define _sctp_walk_fwdtsn(pos, chunk, end)\ for (pos = chunk->subh.fwdtsn_hdr->skip;\ (void *)pos <= (void *)chunk->subh.fwdtsn_hdr->skip + end - sizeof(struct sctp_fwdtsn_skip);\ pos++) /* External references. */ extern struct proto sctp_prot; extern struct proto sctpv6_prot; void sctp_put_port(struct sock *sk); extern struct idr sctp_assocs_id; extern spinlock_t sctp_assocs_id_lock; /* Static inline functions. */ /* Convert from an IP version number to an Address Family symbol. */ static inline int ipver2af(__u8 ipver) { switch (ipver) { case 4: return AF_INET; case 6: return AF_INET6; default: return 0; } } /* Convert from an address parameter type to an address family. */ static inline int param_type2af(__be16 type) { switch (type) { case SCTP_PARAM_IPV4_ADDRESS: return AF_INET; case SCTP_PARAM_IPV6_ADDRESS: return AF_INET6; default: return 0; } } /* Warning: The following hash functions assume a power of two 'size'. */ /* This is the hash function for the SCTP port hash table. */ static inline int sctp_phashfn(struct net *net, __u16 lport) { return (net_hash_mix(net) + lport) & (sctp_port_hashsize - 1); } /* This is the hash function for the endpoint hash table. */ static inline int sctp_ep_hashfn(struct net *net, __u16 lport) { return (net_hash_mix(net) + lport) & (sctp_ep_hashsize - 1); } #define sctp_for_each_hentry(epb, head) \ hlist_for_each_entry(epb, head, node) /* Is a socket of this style? */ #define sctp_style(sk, style) __sctp_style((sk), (SCTP_SOCKET_##style)) static inline int __sctp_style(const struct sock *sk, enum sctp_socket_type style) { return sctp_sk(sk)->type == style; } /* Is the association in this state? */ #define sctp_state(asoc, state) __sctp_state((asoc), (SCTP_STATE_##state)) static inline int __sctp_state(const struct sctp_association *asoc, enum sctp_state state) { return asoc->state == state; } /* Is the socket in this state? */ #define sctp_sstate(sk, state) __sctp_sstate((sk), (SCTP_SS_##state)) static inline int __sctp_sstate(const struct sock *sk, enum sctp_sock_state state) { return sk->sk_state == state; } /* Map v4-mapped v6 address back to v4 address */ static inline void sctp_v6_map_v4(union sctp_addr *addr) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = addr->v6.sin6_port; addr->v4.sin_addr.s_addr = addr->v6.sin6_addr.s6_addr32[3]; } /* Map v4 address to v4-mapped v6 address */ static inline void sctp_v4_map_v6(union sctp_addr *addr) { __be16 port; port = addr->v4.sin_port; addr->v6.sin6_addr.s6_addr32[3] = addr->v4.sin_addr.s_addr; addr->v6.sin6_port = port; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_scope_id = 0; addr->v6.sin6_addr.s6_addr32[0] = 0; addr->v6.sin6_addr.s6_addr32[1] = 0; addr->v6.sin6_addr.s6_addr32[2] = htonl(0x0000ffff); } /* The cookie is always 0 since this is how it's used in the * pmtu code. */ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t) { if (t->dst && !dst_check(t->dst, t->dst_cookie)) sctp_transport_dst_release(t); return t->dst; } /* Calculate max payload size given a MTU, or the total overhead if * given MTU is zero */ static inline __u32 __sctp_mtu_payload(const struct sctp_sock *sp, const struct sctp_transport *t, __u32 mtu, __u32 extra) { __u32 overhead = sizeof(struct sctphdr) + extra; if (sp) { overhead += sp->pf->af->net_header_len; if (sp->udp_port && (!t || t->encap_port)) overhead += sizeof(struct udphdr); } else { overhead += sizeof(struct ipv6hdr); } if (WARN_ON_ONCE(mtu && mtu <= overhead)) mtu = overhead; return mtu ? mtu - overhead : overhead; } static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp, __u32 mtu, __u32 extra) { return __sctp_mtu_payload(sp, NULL, mtu, extra); } static inline __u32 sctp_dst_mtu(const struct dst_entry *dst) { return SCTP_TRUNC4(max_t(__u32, dst_mtu(dst), SCTP_DEFAULT_MINSEGMENT)); } static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) { __u32 pmtu = sctp_dst_mtu(t->dst); if (t->pathmtu == pmtu) return true; t->pathmtu = pmtu; return false; } static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize) { return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize); } static inline int sctp_transport_pl_hlen(struct sctp_transport *t) { return __sctp_mtu_payload(sctp_sk(t->asoc->base.sk), t, 0, 0) - sizeof(struct sctphdr); } static inline void sctp_transport_pl_reset(struct sctp_transport *t) { if (t->probe_interval && (t->param_flags & SPP_PMTUD_ENABLE) && (t->state == SCTP_ACTIVE || t->state == SCTP_UNKNOWN)) { if (t->pl.state == SCTP_PL_DISABLED) { t->pl.state = SCTP_PL_BASE; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pl.probe_size = SCTP_BASE_PLPMTU; sctp_transport_reset_probe_timer(t); } } else { if (t->pl.state != SCTP_PL_DISABLED) { if (del_timer(&t->probe_timer)) sctp_transport_put(t); t->pl.state = SCTP_PL_DISABLED; } } } static inline void sctp_transport_pl_update(struct sctp_transport *t) { if (t->pl.state == SCTP_PL_DISABLED) return; t->pl.state = SCTP_PL_BASE; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pl.probe_size = SCTP_BASE_PLPMTU; sctp_transport_reset_probe_timer(t); } static inline bool sctp_transport_pl_enabled(struct sctp_transport *t) { return t->pl.state != SCTP_PL_DISABLED; } static inline bool sctp_newsk_ready(const struct sock *sk) { return sock_flag(sk, SOCK_DEAD) || sk->sk_socket; } static inline void sctp_sock_set_nodelay(struct sock *sk) { lock_sock(sk); sctp_sk(sk)->nodelay = true; release_sock(sk); } #endif /* __net_sctp_h__ */
2650 1313 1314 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/bug.h> #include <linux/atomic.h> #include <linux/errseq.h> #include <linux/log2.h> /* * An errseq_t is a way of recording errors in one place, and allowing any * number of "subscribers" to tell whether it has changed since a previous * point where it was sampled. * * It's implemented as an unsigned 32-bit value. The low order bits are * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits * are used as a counter. This is done with atomics instead of locking so that * these functions can be called from any context. * * The general idea is for consumers to sample an errseq_t value. That value * can later be used to tell whether any new errors have occurred since that * sampling was done. * * Note that there is a risk of collisions if new errors are being recorded * frequently, since we have so few bits to use as a counter. * * To mitigate this, one bit is used as a flag to tell whether the value has * been sampled since a new value was recorded. That allows us to avoid bumping * the counter if no one has sampled it since the last time an error was * recorded. * * A new errseq_t should always be zeroed out. A errseq_t value of all zeroes * is the special (but common) case where there has never been an error. An all * zero value thus serves as the "epoch" if one wishes to know whether there * has ever been an error set since it was first initialized. */ /* The low bits are designated for error code (max of MAX_ERRNO) */ #define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** * errseq_set - set a errseq_t for later reporting * @eseq: errseq_t field that should be set * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in @eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * * Return: The previous value, primarily for debugging purposes. The * return value should not be used as a previously sampled value in later * calls as it will not have the SEEN flag set. */ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; /* MAX_ERRNO must be able to serve as a mask */ BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it * doesn't then just throw a warning and don't record anything. We * also don't accept zero here as that would effectively clear a * previous error. */ old = READ_ONCE(*eseq); if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO), "err = %d\n", err)) return old; for (;;) { errseq_t new; /* Clear out error bits and set new error */ new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) new += ERRSEQ_CTR_INC; /* If there would be no change, then call it done */ if (new == old) { cur = new; break; } /* Try to swap the new value into place */ cur = cmpxchg(eseq, old, new); /* * Call it success if we did the swap or someone else beat us * to it for the same value. */ if (likely(cur == old || cur == new)) break; /* Raced with an update, try again */ old = cur; } return cur; } EXPORT_SYMBOL(errseq_set); /** * errseq_sample() - Grab current errseq_t value. * @eseq: Pointer to errseq_t to be sampled. * * This function allows callers to initialise their errseq_t variable. * If the error has been "seen", new callers will not see an old error. * If there is an unseen error in @eseq, the caller of this function will * see it the next time it checks for an error. * * Context: Any context. * Return: The current errseq value. */ errseq_t errseq_sample(errseq_t *eseq) { errseq_t old = READ_ONCE(*eseq); /* If nobody has seen this error yet, then we can be the first. */ if (!(old & ERRSEQ_SEEN)) old = 0; return old; } EXPORT_SYMBOL(errseq_sample); /** * errseq_check() - Has an error occurred since a particular sample point? * @eseq: Pointer to errseq_t value to be checked. * @since: Previously-sampled errseq_t from which to check. * * Grab the value that eseq points to, and see if it has changed @since * the given value was sampled. The @since value is not advanced, so there * is no need to mark the value as seen. * * Return: The latest error set in the errseq_t or 0 if it hasn't changed. */ int errseq_check(errseq_t *eseq, errseq_t since) { errseq_t cur = READ_ONCE(*eseq); if (likely(cur == since)) return 0; return -(cur & MAX_ERRNO); } EXPORT_SYMBOL(errseq_check); /** * errseq_check_and_advance() - Check an errseq_t and advance to current value. * @eseq: Pointer to value being checked and reported. * @since: Pointer to previously-sampled errseq_t to check against and advance. * * Grab the eseq value, and see whether it matches the value that @since * points to. If it does, then just return 0. * * If it doesn't, then the value has changed. Set the "seen" flag, and try to * swap it into place as the new eseq value. Then, set that value as the new * "since" value, and return whatever the error portion is set to. * * Note that no locking is provided here for concurrent updates to the "since" * value. The caller must provide that if necessary. Because of this, callers * may want to do a lockless errseq_check before taking the lock and calling * this. * * Return: Negative errno if one has been stored, or 0 if no new error has * occurred. */ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) { int err = 0; errseq_t old, new; /* * Most callers will want to use the inline wrapper to check this, * so that the common case of no error is handled without needing * to take the lock that protects the "since" value. */ old = READ_ONCE(*eseq); if (old != *since) { /* * Set the flag and try to swap it into place if it has * changed. * * We don't care about the outcome of the swap here. If the * swap doesn't occur, then it has either been updated by a * writer who is altering the value in some way (updating * counter or resetting the error), or another reader who is * just setting the "seen" flag. Either outcome is OK, and we * can advance "since" and return an error based on what we * have. */ new = old | ERRSEQ_SEEN; if (new != old) cmpxchg(eseq, old, new); *since = new; err = -(new & MAX_ERRNO); } return err; } EXPORT_SYMBOL(errseq_check_and_advance);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ #include "translation-table.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/crc32c.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "soft-interface.h" #include "tvlv.h" static struct kmem_cache *batadv_tl_cache __read_mostly; static struct kmem_cache *batadv_tg_cache __read_mostly; static struct kmem_cache *batadv_tt_orig_cache __read_mostly; static struct kmem_cache *batadv_tt_change_cache __read_mostly; static struct kmem_cache *batadv_tt_req_cache __read_mostly; static struct kmem_cache *batadv_tt_roam_cache __read_mostly; /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; static struct lock_class_key batadv_tt_global_hash_lock_class_key; static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node); static void batadv_tt_purge(struct work_struct *work); static void batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry); static void batadv_tt_global_del(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid, const char *message, bool roaming); /** * batadv_compare_tt() - check if two TT entries are the same * @node: the list element pointer of the first TT entry * @data2: pointer to the tt_common_entry of the second TT entry * * Compare the MAC address and the VLAN ID of the two TT entries and check if * they are the same TT client. * Return: true if the two TT clients are the same, false otherwise */ static bool batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, hash_entry); const struct batadv_tt_common_entry *tt1 = data1; const struct batadv_tt_common_entry *tt2 = data2; return (tt1->vid == tt2->vid) && batadv_compare_eth(data1, data2); } /** * batadv_choose_tt() - return the index of the tt entry in the hash table * @data: pointer to the tt_common_entry object to map * @size: the size of the hash table * * Return: the hash index where the object represented by 'data' should be * stored at. */ static inline u32 batadv_choose_tt(const void *data, u32 size) { struct batadv_tt_common_entry *tt; u32 hash = 0; tt = (struct batadv_tt_common_entry *)data; hash = jhash(&tt->addr, ETH_ALEN, hash); hash = jhash(&tt->vid, sizeof(tt->vid), hash); return hash % size; } /** * batadv_tt_hash_find() - look for a client in the given hash table * @hash: the hash table to search * @addr: the mac address of the client to look for * @vid: VLAN identifier * * Return: a pointer to the tt_common struct belonging to the searched client if * found, NULL otherwise. */ static struct batadv_tt_common_entry * batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, unsigned short vid) { struct hlist_head *head; struct batadv_tt_common_entry to_search, *tt, *tt_tmp = NULL; u32 index; if (!hash) return NULL; ether_addr_copy(to_search.addr, addr); to_search.vid = vid; index = batadv_choose_tt(&to_search, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(tt, head, hash_entry) { if (!batadv_compare_eth(tt, addr)) continue; if (tt->vid != vid) continue; if (!kref_get_unless_zero(&tt->refcount)) continue; tt_tmp = tt; break; } rcu_read_unlock(); return tt_tmp; } /** * batadv_tt_local_hash_find() - search the local table for a given client * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier * * Return: a pointer to the corresponding tt_local_entry struct if the client is * found, NULL otherwise. */ static struct batadv_tt_local_entry * batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local_entry = NULL; tt_common_entry = batadv_tt_hash_find(bat_priv->tt.local_hash, addr, vid); if (tt_common_entry) tt_local_entry = container_of(tt_common_entry, struct batadv_tt_local_entry, common); return tt_local_entry; } /** * batadv_tt_global_hash_find() - search the global table for a given client * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier * * Return: a pointer to the corresponding tt_global_entry struct if the client * is found, NULL otherwise. */ struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_global_entry *tt_global_entry = NULL; tt_common_entry = batadv_tt_hash_find(bat_priv->tt.global_hash, addr, vid); if (tt_common_entry) tt_global_entry = container_of(tt_common_entry, struct batadv_tt_global_entry, common); return tt_global_entry; } /** * batadv_tt_local_entry_free_rcu() - free the tt_local_entry * @rcu: rcu pointer of the tt_local_entry */ static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu) { struct batadv_tt_local_entry *tt_local_entry; tt_local_entry = container_of(rcu, struct batadv_tt_local_entry, common.rcu); kmem_cache_free(batadv_tl_cache, tt_local_entry); } /** * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period * @ref: kref pointer of the nc_node */ static void batadv_tt_local_entry_release(struct kref *ref) { struct batadv_tt_local_entry *tt_local_entry; tt_local_entry = container_of(ref, struct batadv_tt_local_entry, common.refcount); batadv_softif_vlan_put(tt_local_entry->vlan); call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu); } /** * batadv_tt_local_entry_put() - decrement the tt_local_entry refcounter and * possibly release it * @tt_local_entry: tt_local_entry to be free'd */ static void batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) { if (!tt_local_entry) return; kref_put(&tt_local_entry->common.refcount, batadv_tt_local_entry_release); } /** * batadv_tt_global_entry_free_rcu() - free the tt_global_entry * @rcu: rcu pointer of the tt_global_entry */ static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) { struct batadv_tt_global_entry *tt_global_entry; tt_global_entry = container_of(rcu, struct batadv_tt_global_entry, common.rcu); kmem_cache_free(batadv_tg_cache, tt_global_entry); } /** * batadv_tt_global_entry_release() - release tt_global_entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the nc_node */ void batadv_tt_global_entry_release(struct kref *ref) { struct batadv_tt_global_entry *tt_global_entry; tt_global_entry = container_of(ref, struct batadv_tt_global_entry, common.refcount); batadv_tt_global_del_orig_list(tt_global_entry); call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu); } /** * batadv_tt_global_hash_count() - count the number of orig entries * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to count entries for * @vid: VLAN identifier * * Return: the number of originators advertising the given address/data * (excluding our self). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; int count; tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) return 0; count = atomic_read(&tt_global_entry->orig_list_count); batadv_tt_global_entry_put(tt_global_entry); return count; } /** * batadv_tt_local_size_mod() - change the size by v of the local table * identified by vid * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier of the sub-table to change * @v: the amount to sum to the local table size */ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv, unsigned short vid, int v) { struct batadv_softif_vlan *vlan; vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return; atomic_add(v, &vlan->tt.num_entries); batadv_softif_vlan_put(vlan); } /** * batadv_tt_local_size_inc() - increase by one the local table size for the * given vid * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier */ static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv, unsigned short vid) { batadv_tt_local_size_mod(bat_priv, vid, 1); } /** * batadv_tt_local_size_dec() - decrease by one the local table size for the * given vid * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier */ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv, unsigned short vid) { batadv_tt_local_size_mod(bat_priv, vid, -1); } /** * batadv_tt_global_size_mod() - change the size by v of the global table * for orig_node identified by vid * @orig_node: the originator for which the table has to be modified * @vid: the VLAN identifier * @v: the amount to sum to the global table size */ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, unsigned short vid, int v) { struct batadv_orig_node_vlan *vlan; vlan = batadv_orig_node_vlan_new(orig_node, vid); if (!vlan) return; if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); if (!hlist_unhashed(&vlan->list)) { hlist_del_init_rcu(&vlan->list); batadv_orig_node_vlan_put(vlan); } spin_unlock_bh(&orig_node->vlan_list_lock); } batadv_orig_node_vlan_put(vlan); } /** * batadv_tt_global_size_inc() - increase by one the global table size for the * given vid * @orig_node: the originator which global table size has to be decreased * @vid: the vlan identifier */ static void batadv_tt_global_size_inc(struct batadv_orig_node *orig_node, unsigned short vid) { batadv_tt_global_size_mod(orig_node, vid, 1); } /** * batadv_tt_global_size_dec() - decrease by one the global table size for the * given vid * @orig_node: the originator which global table size has to be decreased * @vid: the vlan identifier */ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, unsigned short vid) { batadv_tt_global_size_mod(orig_node, vid, -1); } /** * batadv_tt_orig_list_entry_free_rcu() - free the orig_entry * @rcu: rcu pointer of the orig_entry */ static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) { struct batadv_tt_orig_list_entry *orig_entry; orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); kmem_cache_free(batadv_tt_orig_cache, orig_entry); } /** * batadv_tt_orig_list_entry_release() - release tt orig entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the tt orig entry */ static void batadv_tt_orig_list_entry_release(struct kref *ref) { struct batadv_tt_orig_list_entry *orig_entry; orig_entry = container_of(ref, struct batadv_tt_orig_list_entry, refcount); batadv_orig_node_put(orig_entry->orig_node); call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); } /** * batadv_tt_orig_list_entry_put() - decrement the tt orig entry refcounter and * possibly release it * @orig_entry: tt orig entry to be free'd */ static void batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) { if (!orig_entry) return; kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release); } /** * batadv_tt_local_event() - store a local TT event (ADD/DEL) * @bat_priv: the bat priv with all the soft interface information * @tt_local_entry: the TT entry involved in the event * @event_flags: flags to store in the event structure */ static void batadv_tt_local_event(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, u8 event_flags) { struct batadv_tt_change_node *tt_change_node, *entry, *safe; struct batadv_tt_common_entry *common = &tt_local_entry->common; u8 flags = common->flags | event_flags; bool event_removed = false; bool del_op_requested, del_op_entry; tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC); if (!tt_change_node) return; tt_change_node->change.flags = flags; memset(tt_change_node->change.reserved, 0, sizeof(tt_change_node->change.reserved)); ether_addr_copy(tt_change_node->change.addr, common->addr); tt_change_node->change.vid = htons(common->vid); del_op_requested = flags & BATADV_TT_CLIENT_DEL; /* check for ADD+DEL or DEL+ADD events */ spin_lock_bh(&bat_priv->tt.changes_list_lock); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { if (!batadv_compare_eth(entry->change.addr, common->addr)) continue; /* DEL+ADD in the same orig interval have no effect and can be * removed to avoid silly behaviour on the receiver side. The * other way around (ADD+DEL) can happen in case of roaming of * a client still in the NEW state. Roaming of NEW clients is * now possible due to automatically recognition of "temporary" * clients */ del_op_entry = entry->change.flags & BATADV_TT_CLIENT_DEL; if (!del_op_requested && del_op_entry) goto del; if (del_op_requested && !del_op_entry) goto del; /* this is a second add in the same originator interval. It * means that flags have been changed: update them! */ if (!del_op_requested && !del_op_entry) entry->change.flags = flags; continue; del: list_del(&entry->list); kmem_cache_free(batadv_tt_change_cache, entry); kmem_cache_free(batadv_tt_change_cache, tt_change_node); event_removed = true; goto unlock; } /* track the change in the OGMinterval list */ list_add_tail(&tt_change_node->list, &bat_priv->tt.changes_list); unlock: spin_unlock_bh(&bat_priv->tt.changes_list_lock); if (event_removed) atomic_dec(&bat_priv->tt.local_changes); else atomic_inc(&bat_priv->tt.local_changes); } /** * batadv_tt_len() - compute length in bytes of given number of tt changes * @changes_num: number of tt changes * * Return: computed length in bytes. */ static int batadv_tt_len(int changes_num) { return changes_num * sizeof(struct batadv_tvlv_tt_change); } /** * batadv_tt_entries() - compute the number of entries fitting in tt_len bytes * @tt_len: available space * * Return: the number of entries. */ static u16 batadv_tt_entries(u16 tt_len) { return tt_len / batadv_tt_len(1); } /** * batadv_tt_local_table_transmit_size() - calculates the local translation * table size when transmitted over the air * @bat_priv: the bat priv with all the soft interface information * * Return: local translation table size in bytes. */ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { u16 num_vlan = 0; u16 tt_local_entries = 0; struct batadv_softif_vlan *vlan; int hdr_size; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { num_vlan++; tt_local_entries += atomic_read(&vlan->tt.num_entries); } rcu_read_unlock(); /* header size of tvlv encapsulated tt response payload */ hdr_size = sizeof(struct batadv_unicast_tvlv_packet); hdr_size += sizeof(struct batadv_tvlv_hdr); hdr_size += sizeof(struct batadv_tvlv_tt_data); hdr_size += num_vlan * sizeof(struct batadv_tvlv_tt_vlan_data); return hdr_size + batadv_tt_len(tt_local_entries); } static int batadv_tt_local_init(struct batadv_priv *bat_priv) { if (bat_priv->tt.local_hash) return 0; bat_priv->tt.local_hash = batadv_hash_new(1024); if (!bat_priv->tt.local_hash) return -ENOMEM; batadv_hash_set_lock_class(bat_priv->tt.local_hash, &batadv_tt_local_hash_lock_class_key); return 0; } static void batadv_tt_global_free(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global, const char *message) { struct batadv_tt_global_entry *tt_removed_entry; struct hlist_node *tt_removed_node; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(tt_global->common.vid), message); tt_removed_node = batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, &tt_global->common); if (!tt_removed_node) return; /* drop reference of remove hash entry */ tt_removed_entry = hlist_entry(tt_removed_node, struct batadv_tt_global_entry, common.hash_entry); batadv_tt_global_entry_put(tt_removed_entry); } /** * batadv_tt_local_add() - add a new client to the local table or update an * existing client * @soft_iface: netdev struct of the mesh interface * @addr: the mac address of the client to add * @vid: VLAN identifier * @ifindex: index of the interface where the client is connected to (useful to * identify wireless clients) * @mark: the value contained in the skb->mark field of the received packet (if * any) * * Return: true if the client was successfully added, false otherwise. */ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_tt_local_entry *tt_local; struct batadv_tt_global_entry *tt_global = NULL; struct net *net = dev_net(soft_iface); struct batadv_softif_vlan *vlan; struct net_device *in_dev = NULL; struct batadv_hard_iface *in_hardif = NULL; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry; int hash_added, table_size, packet_size_max; bool ret = false; bool roamed_back = false; u8 remote_flags; u32 match_mark; if (ifindex != BATADV_NULL_IFINDEX) in_dev = dev_get_by_index(net, ifindex); if (in_dev) in_hardif = batadv_hardif_get_by_netdev(in_dev); tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!is_multicast_ether_addr(addr)) tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); if (tt_local) { tt_local->last_seen = jiffies; if (tt_local->common.flags & BATADV_TT_CLIENT_PENDING) { batadv_dbg(BATADV_DBG_TT, bat_priv, "Re-adding pending client %pM (vid: %d)\n", addr, batadv_print_vid(vid)); /* whatever the reason why the PENDING flag was set, * this is a client which was enqueued to be removed in * this orig_interval. Since it popped up again, the * flag can be reset like it was never enqueued */ tt_local->common.flags &= ~BATADV_TT_CLIENT_PENDING; goto add_event; } if (tt_local->common.flags & BATADV_TT_CLIENT_ROAM) { batadv_dbg(BATADV_DBG_TT, bat_priv, "Roaming client %pM (vid: %d) came back to its original location\n", addr, batadv_print_vid(vid)); /* the ROAM flag is set because this client roamed away * and the node got a roaming_advertisement message. Now * that the client popped up again at its original * location such flag can be unset */ tt_local->common.flags &= ~BATADV_TT_CLIENT_ROAM; roamed_back = true; } goto check_roaming; } /* Ignore the client if we cannot send it in a full table response. */ table_size = batadv_tt_local_table_transmit_size(bat_priv); table_size += batadv_tt_len(1); packet_size_max = atomic_read(&bat_priv->packet_size_max); if (table_size > packet_size_max) { net_ratelimited_function(batadv_info, soft_iface, "Local translation table size (%i) exceeds maximum packet size (%i); Ignoring new local tt entry: %pM\n", table_size, packet_size_max, addr); goto out; } tt_local = kmem_cache_alloc(batadv_tl_cache, GFP_ATOMIC); if (!tt_local) goto out; /* increase the refcounter of the related vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) { net_ratelimited_function(batadv_info, soft_iface, "adding TT local entry %pM to non-existent VLAN %d\n", addr, batadv_print_vid(vid)); kmem_cache_free(batadv_tl_cache, tt_local); tt_local = NULL; goto out; } batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n", addr, batadv_print_vid(vid), (u8)atomic_read(&bat_priv->tt.vn)); ether_addr_copy(tt_local->common.addr, addr); /* The local entry has to be marked as NEW to avoid to send it in * a full table response going out before the next ttvn increment * (consistency check) */ tt_local->common.flags = BATADV_TT_CLIENT_NEW; tt_local->common.vid = vid; if (batadv_is_wifi_hardif(in_hardif)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; kref_init(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; tt_local->vlan = vlan; /* the batman interface mac and multicast addresses should never be * purged */ if (batadv_compare_eth(addr, soft_iface->dev_addr) || is_multicast_ether_addr(addr)) tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE; kref_get(&tt_local->common.refcount); hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local->common, &tt_local->common.hash_entry); if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_local_entry_put(tt_local); goto out; } add_event: batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS); check_roaming: /* Check whether it is a roaming, but don't do anything if the roaming * process has already been handled */ if (tt_global && !(tt_global->common.flags & BATADV_TT_CLIENT_ROAM)) { /* These node are probably going to update their tt table */ head = &tt_global->orig_list; rcu_read_lock(); hlist_for_each_entry_rcu(orig_entry, head, list) { batadv_send_roam_adv(bat_priv, tt_global->common.addr, tt_global->common.vid, orig_entry->orig_node); } rcu_read_unlock(); if (roamed_back) { batadv_tt_global_free(bat_priv, tt_global, "Roaming canceled"); tt_global = NULL; } else { /* The global entry has to be marked as ROAMING and * has to be kept for consistency purpose */ tt_global->common.flags |= BATADV_TT_CLIENT_ROAM; tt_global->roam_at = jiffies; } } /* store the current remote flags before altering them. This helps * understanding is flags are changing or not */ remote_flags = tt_local->common.flags & BATADV_TT_REMOTE_MASK; if (batadv_is_wifi_hardif(in_hardif)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; else tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI; /* check the mark in the skb: if it's equal to the configured * isolation_mark, it means the packet is coming from an isolated * non-mesh client */ match_mark = (mark & bat_priv->isolation_mark_mask); if (bat_priv->isolation_mark_mask && match_mark == bat_priv->isolation_mark) tt_local->common.flags |= BATADV_TT_CLIENT_ISOLA; else tt_local->common.flags &= ~BATADV_TT_CLIENT_ISOLA; /* if any "dynamic" flag has been modified, resend an ADD event for this * entry so that all the nodes can get the new flags */ if (remote_flags ^ (tt_local->common.flags & BATADV_TT_REMOTE_MASK)) batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS); ret = true; out: batadv_hardif_put(in_hardif); dev_put(in_dev); batadv_tt_local_entry_put(tt_local); batadv_tt_global_entry_put(tt_global); return ret; } /** * batadv_tt_prepare_tvlv_global_data() - prepare the TVLV TT header to send * within a TT Response directed to another node * @orig_node: originator for which the TT data has to be prepared * @tt_data: uninitialised pointer to the address of the TVLV buffer * @tt_change: uninitialised pointer to the address of the area where the TT * changed can be stored * @tt_len: pointer to the length to reserve to the tt_change. if -1 this * function reserves the amount of space needed to send the entire global TT * table. In case of success the value is updated with the real amount of * reserved bytes * Allocate the needed amount of memory for the entire TT TVLV and write its * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * * Return: the size of the allocated buffer or 0 in case of failure. */ static u16 batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, s32 *tt_len) { u16 num_vlan = 0; u16 num_entries = 0; u16 change_offset; u16 tvlv_len; struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_orig_node_vlan *vlan; u8 *tt_change_ptr; spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { num_vlan++; num_entries += atomic_read(&vlan->tt.num_entries); } change_offset = sizeof(**tt_data); change_offset += num_vlan * sizeof(*tt_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) *tt_len = batadv_tt_len(num_entries); tvlv_len = *tt_len; tvlv_len += change_offset; *tt_data = kmalloc(tvlv_len, GFP_ATOMIC); if (!*tt_data) { *tt_len = 0; goto out; } (*tt_data)->flags = BATADV_NO_FLAGS; (*tt_data)->ttvn = atomic_read(&orig_node->last_ttvn); (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); tt_vlan->reserved = 0; tt_vlan++; } tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: spin_unlock_bh(&orig_node->vlan_list_lock); return tvlv_len; } /** * batadv_tt_prepare_tvlv_local_data() - allocate and prepare the TT TVLV for * this node * @bat_priv: the bat priv with all the soft interface information * @tt_data: uninitialised pointer to the address of the TVLV buffer * @tt_change: uninitialised pointer to the address of the area where the TT * changes can be stored * @tt_len: pointer to the length to reserve to the tt_change. if -1 this * function reserves the amount of space needed to send the entire local TT * table. In case of success the value is updated with the real amount of * reserved bytes * * Allocate the needed amount of memory for the entire TT TVLV and write its * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN. * * Return: the size of the allocated buffer or 0 in case of failure. */ static u16 batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, s32 *tt_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_softif_vlan *vlan; u16 num_vlan = 0; u16 vlan_entries = 0; u16 total_entries = 0; u16 tvlv_len; u8 *tt_change_ptr; int change_offset; spin_lock_bh(&bat_priv->softif_vlan_list_lock); hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) continue; num_vlan++; total_entries += vlan_entries; } change_offset = sizeof(**tt_data); change_offset += num_vlan * sizeof(*tt_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) *tt_len = batadv_tt_len(total_entries); tvlv_len = *tt_len; tvlv_len += change_offset; *tt_data = kmalloc(tvlv_len, GFP_ATOMIC); if (!*tt_data) { tvlv_len = 0; goto out; } (*tt_data)->flags = BATADV_NO_FLAGS; (*tt_data)->ttvn = atomic_read(&bat_priv->tt.vn); (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) continue; tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); tt_vlan->reserved = 0; tt_vlan++; } tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return tvlv_len; } /** * batadv_tt_tvlv_container_update() - update the translation table tvlv * container after local tt changes have been committed * @bat_priv: the bat priv with all the soft interface information */ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) { struct batadv_tt_change_node *entry, *safe; struct batadv_tvlv_tt_data *tt_data; struct batadv_tvlv_tt_change *tt_change; int tt_diff_len, tt_change_len = 0; int tt_diff_entries_num = 0; int tt_diff_entries_count = 0; u16 tvlv_len; tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes); tt_diff_len = batadv_tt_len(tt_diff_entries_num); /* if we have too many changes for one packet don't send any * and wait for the tt table request which will be fragmented */ if (tt_diff_len > bat_priv->soft_iface->mtu) tt_diff_len = 0; tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tt_data, &tt_change, &tt_diff_len); if (!tvlv_len) return; tt_data->flags = BATADV_TT_OGM_DIFF; if (tt_diff_len == 0) goto container_register; spin_lock_bh(&bat_priv->tt.changes_list_lock); atomic_set(&bat_priv->tt.local_changes, 0); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { if (tt_diff_entries_count < tt_diff_entries_num) { memcpy(tt_change + tt_diff_entries_count, &entry->change, sizeof(struct batadv_tvlv_tt_change)); tt_diff_entries_count++; } list_del(&entry->list); kmem_cache_free(batadv_tt_change_cache, entry); } spin_unlock_bh(&bat_priv->tt.changes_list_lock); /* Keep the buffer for possible tt_request */ spin_lock_bh(&bat_priv->tt.last_changeset_lock); kfree(bat_priv->tt.last_changeset); bat_priv->tt.last_changeset_len = 0; bat_priv->tt.last_changeset = NULL; tt_change_len = batadv_tt_len(tt_diff_entries_count); /* check whether this new OGM has no changes due to size problems */ if (tt_diff_entries_count > 0) { /* if kmalloc() fails we will reply with the full table * instead of providing the diff */ bat_priv->tt.last_changeset = kzalloc(tt_diff_len, GFP_ATOMIC); if (bat_priv->tt.last_changeset) { memcpy(bat_priv->tt.last_changeset, tt_change, tt_change_len); bat_priv->tt.last_changeset_len = tt_diff_len; } } spin_unlock_bh(&bat_priv->tt.last_changeset_lock); container_register: batadv_tvlv_container_register(bat_priv, BATADV_TVLV_TT, 1, tt_data, tvlv_len); kfree(tt_data); } /** * batadv_tt_local_dump_entry() - Dump one TT local entry into a message * @msg :Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @common: tt local & tt global common data * * Return: Error code, or 0 on success */ static int batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_tt_common_entry *common) { void *hdr; struct batadv_softif_vlan *vlan; struct batadv_tt_local_entry *local; unsigned int last_seen_msecs; u32 crc; local = container_of(common, struct batadv_tt_local_entry, common); last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen); vlan = batadv_softif_vlan_get(bat_priv, common->vid); if (!vlan) return 0; crc = vlan->tt.crc; batadv_softif_vlan_put(vlan); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_TRANSTABLE_LOCAL); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) goto nla_put_failure; if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) && nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @hash: hash to dump * @bucket: bucket index to dump * @idx_s: Number of entries to skip * * Return: Error code, or 0 on success */ static int batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hashtable *hash, unsigned int bucket, int *idx_s) { struct batadv_tt_common_entry *common; int idx = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(common, &hash->table[bucket], hash_entry) { if (idx++ < *idx_s) continue; if (batadv_tt_local_dump_entry(msg, portid, cb, bat_priv, common)) { spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = idx - 1; return -EMSGSIZE; } } spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = 0; return 0; } /** * batadv_tt_local_dump() - Dump TT local entries into a message * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: Error code, or 0 on success */ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; int ret; int ifindex; int bucket = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return -EINVAL; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { ret = -ENODEV; goto out; } bat_priv = netdev_priv(soft_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } hash = bat_priv->tt.local_hash; while (bucket < hash->size) { if (batadv_tt_local_dump_bucket(msg, portid, cb, bat_priv, hash, bucket, &idx)) break; bucket++; } ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); cb->args[0] = bucket; cb->args[1] = idx; return ret; } static void batadv_tt_local_set_pending(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, u16 flags, const char *message) { batadv_tt_local_event(bat_priv, tt_local_entry, flags); /* The local client has to be marked as "pending to be removed" but has * to be kept in the table in order to send it in a full table * response issued before the net ttvn increment (consistency check) */ tt_local_entry->common.flags |= BATADV_TT_CLIENT_PENDING; batadv_dbg(BATADV_DBG_TT, bat_priv, "Local tt entry (%pM, vid: %d) pending to be removed: %s\n", tt_local_entry->common.addr, batadv_print_vid(tt_local_entry->common.vid), message); } /** * batadv_tt_local_remove() - logically remove an entry from the local table * @bat_priv: the bat priv with all the soft interface information * @addr: the MAC address of the client to remove * @vid: VLAN identifier * @message: message to append to the log on deletion * @roaming: true if the deletion is due to a roaming event * * Return: the flags assigned to the local entry before being deleted */ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming) { struct batadv_tt_local_entry *tt_removed_entry; struct batadv_tt_local_entry *tt_local_entry; u16 flags, curr_flags = BATADV_NO_FLAGS; struct hlist_node *tt_removed_node; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) goto out; curr_flags = tt_local_entry->common.flags; flags = BATADV_TT_CLIENT_DEL; /* if this global entry addition is due to a roaming, the node has to * mark the local entry as "roamed" in order to correctly reroute * packets later */ if (roaming) { flags |= BATADV_TT_CLIENT_ROAM; /* mark the local client as ROAMed */ tt_local_entry->common.flags |= BATADV_TT_CLIENT_ROAM; } if (!(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW)) { batadv_tt_local_set_pending(bat_priv, tt_local_entry, flags, message); goto out; } /* if this client has been added right now, it is possible to * immediately purge it */ batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL); tt_removed_node = batadv_hash_remove(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local_entry->common); if (!tt_removed_node) goto out; /* drop reference of remove hash entry */ tt_removed_entry = hlist_entry(tt_removed_node, struct batadv_tt_local_entry, common.hash_entry); batadv_tt_local_entry_put(tt_removed_entry); out: batadv_tt_local_entry_put(tt_local_entry); return curr_flags; } /** * batadv_tt_local_purge_list() - purge inactive tt local entries * @bat_priv: the bat priv with all the soft interface information * @head: pointer to the list containing the local tt entries * @timeout: parameter deciding whether a given tt local entry is considered * inactive or not */ static void batadv_tt_local_purge_list(struct batadv_priv *bat_priv, struct hlist_head *head, int timeout) { struct batadv_tt_local_entry *tt_local_entry; struct batadv_tt_common_entry *tt_common_entry; struct hlist_node *node_tmp; hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { tt_local_entry = container_of(tt_common_entry, struct batadv_tt_local_entry, common); if (tt_local_entry->common.flags & BATADV_TT_CLIENT_NOPURGE) continue; /* entry already marked for deletion */ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING) continue; if (!batadv_has_timed_out(tt_local_entry->last_seen, timeout)) continue; batadv_tt_local_set_pending(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL, "timed out"); } } /** * batadv_tt_local_purge() - purge inactive tt local entries * @bat_priv: the bat priv with all the soft interface information * @timeout: parameter deciding whether a given tt local entry is considered * inactive or not */ static void batadv_tt_local_purge(struct batadv_priv *bat_priv, int timeout) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); batadv_tt_local_purge_list(bat_priv, head, timeout); spin_unlock_bh(list_lock); } } static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; if (!bat_priv->tt.local_hash) return; hash = bat_priv->tt.local_hash; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { hlist_del_rcu(&tt_common_entry->hash_entry); tt_local = container_of(tt_common_entry, struct batadv_tt_local_entry, common); batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } batadv_hash_destroy(hash); bat_priv->tt.local_hash = NULL; } static int batadv_tt_global_init(struct batadv_priv *bat_priv) { if (bat_priv->tt.global_hash) return 0; bat_priv->tt.global_hash = batadv_hash_new(1024); if (!bat_priv->tt.global_hash) return -ENOMEM; batadv_hash_set_lock_class(bat_priv->tt.global_hash, &batadv_tt_global_hash_lock_class_key); return 0; } static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_change_node *entry, *safe; spin_lock_bh(&bat_priv->tt.changes_list_lock); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { list_del(&entry->list); kmem_cache_free(batadv_tt_change_cache, entry); } atomic_set(&bat_priv->tt.local_changes, 0); spin_unlock_bh(&bat_priv->tt.changes_list_lock); } /** * batadv_tt_global_orig_entry_find() - find a TT orig_list_entry * @entry: the TT global entry where the orig_list_entry has to be * extracted from * @orig_node: the originator for which the orig_list_entry has to be found * * retrieve the orig_tt_list_entry belonging to orig_node from the * batadv_tt_global_entry list * * Return: it with an increased refcounter, NULL if not found */ static struct batadv_tt_orig_list_entry * batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, const struct batadv_orig_node *orig_node) { struct batadv_tt_orig_list_entry *tmp_orig_entry, *orig_entry = NULL; const struct hlist_head *head; rcu_read_lock(); head = &entry->orig_list; hlist_for_each_entry_rcu(tmp_orig_entry, head, list) { if (tmp_orig_entry->orig_node != orig_node) continue; if (!kref_get_unless_zero(&tmp_orig_entry->refcount)) continue; orig_entry = tmp_orig_entry; break; } rcu_read_unlock(); return orig_entry; } /** * batadv_tt_global_entry_has_orig() - check if a TT global entry is also * handled by a given originator * @entry: the TT global entry to check * @orig_node: the originator to search in the list * @flags: a pointer to store TT flags for the given @entry received * from @orig_node * * find out if an orig_node is already in the list of a tt_global_entry. * * Return: true if found, false otherwise */ static bool batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, const struct batadv_orig_node *orig_node, u8 *flags) { struct batadv_tt_orig_list_entry *orig_entry; bool found = false; orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node); if (orig_entry) { found = true; if (flags) *flags = orig_entry->flags; batadv_tt_orig_list_entry_put(orig_entry); } return found; } /** * batadv_tt_global_sync_flags() - update TT sync flags * @tt_global: the TT global entry to update sync flags in * * Updates the sync flag bits in the tt_global flag attribute with a logical * OR of all sync flags from any of its TT orig entries. */ static void batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global) { struct batadv_tt_orig_list_entry *orig_entry; const struct hlist_head *head; u16 flags = BATADV_NO_FLAGS; rcu_read_lock(); head = &tt_global->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) flags |= orig_entry->flags; rcu_read_unlock(); flags |= tt_global->common.flags & (~BATADV_TT_SYNC_MASK); tt_global->common.flags = flags; } /** * batadv_tt_global_orig_entry_add() - add or update a TT orig entry * @tt_global: the TT global entry to add an orig entry in * @orig_node: the originator to add an orig entry for * @ttvn: translation table version number of this changeset * @flags: TT sync flags */ static void batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, struct batadv_orig_node *orig_node, int ttvn, u8 flags) { struct batadv_tt_orig_list_entry *orig_entry; spin_lock_bh(&tt_global->list_lock); orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (orig_entry) { /* refresh the ttvn: the current value could be a bogus one that * was added during a "temporary client detection" */ orig_entry->ttvn = ttvn; orig_entry->flags = flags; goto sync_flags; } orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC); if (!orig_entry) goto out; INIT_HLIST_NODE(&orig_entry->list); kref_get(&orig_node->refcount); batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; orig_entry->flags = flags; kref_init(&orig_entry->refcount); kref_get(&orig_entry->refcount); hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); atomic_inc(&tt_global->orig_list_count); sync_flags: batadv_tt_global_sync_flags(tt_global); out: batadv_tt_orig_list_entry_put(orig_entry); spin_unlock_bh(&tt_global->list_lock); } /** * batadv_tt_global_add() - add a new TT global entry or update an existing one * @bat_priv: the bat priv with all the soft interface information * @orig_node: the originator announcing the client * @tt_addr: the mac address of the non-mesh client * @vid: VLAN identifier * @flags: TT flags that have to be set for this non-mesh client * @ttvn: the tt version number ever announcing this non-mesh client * * Add a new TT global entry for the given originator. If the entry already * exists add a new reference to the given originator (a global entry can have * references to multiple originators) and adjust the flags attribute to reflect * the function argument. * If a TT local entry exists for this non-mesh client remove it. * * The caller must hold the orig_node refcount. * * Return: true if the new entry has been added, false otherwise */ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *tt_addr, unsigned short vid, u16 flags, u8 ttvn) { struct batadv_tt_global_entry *tt_global_entry; struct batadv_tt_local_entry *tt_local_entry; bool ret = false; int hash_added; struct batadv_tt_common_entry *common; u16 local_flags; /* ignore global entries from backbone nodes */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) return true; tt_global_entry = batadv_tt_global_hash_find(bat_priv, tt_addr, vid); tt_local_entry = batadv_tt_local_hash_find(bat_priv, tt_addr, vid); /* if the node already has a local client for this entry, it has to wait * for a roaming advertisement instead of manually messing up the global * table */ if ((flags & BATADV_TT_CLIENT_TEMP) && tt_local_entry && !(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW)) goto out; if (!tt_global_entry) { tt_global_entry = kmem_cache_zalloc(batadv_tg_cache, GFP_ATOMIC); if (!tt_global_entry) goto out; common = &tt_global_entry->common; ether_addr_copy(common->addr, tt_addr); common->vid = vid; if (!is_multicast_ether_addr(common->addr)) common->flags = flags & (~BATADV_TT_SYNC_MASK); tt_global_entry->roam_at = 0; /* node must store current time in case of roaming. This is * needed to purge this entry out on timeout (if nobody claims * it) */ if (flags & BATADV_TT_CLIENT_ROAM) tt_global_entry->roam_at = jiffies; kref_init(&common->refcount); common->added_at = jiffies; INIT_HLIST_HEAD(&tt_global_entry->orig_list); atomic_set(&tt_global_entry->orig_list_count, 0); spin_lock_init(&tt_global_entry->list_lock); kref_get(&common->refcount); hash_added = batadv_hash_add(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, common, &common->hash_entry); if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_global_entry_put(tt_global_entry); goto out_remove; } } else { common = &tt_global_entry->common; /* If there is already a global entry, we can use this one for * our processing. * But if we are trying to add a temporary client then here are * two options at this point: * 1) the global client is not a temporary client: the global * client has to be left as it is, temporary information * should never override any already known client state * 2) the global client is a temporary client: purge the * originator list and add the new one orig_entry */ if (flags & BATADV_TT_CLIENT_TEMP) { if (!(common->flags & BATADV_TT_CLIENT_TEMP)) goto out; if (batadv_tt_global_entry_has_orig(tt_global_entry, orig_node, NULL)) goto out_remove; batadv_tt_global_del_orig_list(tt_global_entry); goto add_orig_entry; } /* if the client was temporary added before receiving the first * OGM announcing it, we have to clear the TEMP flag. Also, * remove the previous temporary orig node and re-add it * if required. If the orig entry changed, the new one which * is a non-temporary entry is preferred. */ if (common->flags & BATADV_TT_CLIENT_TEMP) { batadv_tt_global_del_orig_list(tt_global_entry); common->flags &= ~BATADV_TT_CLIENT_TEMP; } /* the change can carry possible "attribute" flags like the * TT_CLIENT_TEMP, therefore they have to be copied in the * client entry */ if (!is_multicast_ether_addr(common->addr)) common->flags |= flags & (~BATADV_TT_SYNC_MASK); /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a * delete + roaming change for this originator. * * We should first delete the old originator before adding the * new one. */ if (common->flags & BATADV_TT_CLIENT_ROAM) { batadv_tt_global_del_orig_list(tt_global_entry); common->flags &= ~BATADV_TT_CLIENT_ROAM; tt_global_entry->roam_at = 0; } } add_orig_entry: /* add the new orig_entry (if needed) or update it */ batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn, flags & BATADV_TT_SYNC_MASK); batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new global tt entry: %pM (vid: %d, via %pM)\n", common->addr, batadv_print_vid(common->vid), orig_node->orig); ret = true; out_remove: /* Do not remove multicast addresses from the local hash on * global additions */ if (is_multicast_ether_addr(tt_addr)) goto out; /* remove address from local hash if present */ local_flags = batadv_tt_local_remove(bat_priv, tt_addr, vid, "global tt received", flags & BATADV_TT_CLIENT_ROAM); tt_global_entry->common.flags |= local_flags & BATADV_TT_CLIENT_WIFI; if (!(flags & BATADV_TT_CLIENT_ROAM)) /* this is a normal global add. Therefore the client is not in a * roaming state anymore. */ tt_global_entry->common.flags &= ~BATADV_TT_CLIENT_ROAM; out: batadv_tt_global_entry_put(tt_global_entry); batadv_tt_local_entry_put(tt_local_entry); return ret; } /** * batadv_transtable_best_orig() - Get best originator list entry from tt entry * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be analyzed * * This function assumes the caller holds rcu_read_lock(). * Return: best originator list entry or NULL on errors. */ static struct batadv_tt_orig_list_entry * batadv_transtable_best_orig(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry) { struct batadv_neigh_node *router, *best_router = NULL; struct batadv_algo_ops *bao = bat_priv->algo_ops; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry, *best_entry = NULL; head = &tt_global_entry->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) { router = batadv_orig_router_get(orig_entry->orig_node, BATADV_IF_DEFAULT); if (!router) continue; if (best_router && bao->neigh.cmp(router, BATADV_IF_DEFAULT, best_router, BATADV_IF_DEFAULT) <= 0) { batadv_neigh_node_put(router); continue; } /* release the refcount for the "old" best */ batadv_neigh_node_put(best_router); best_entry = orig_entry; best_router = router; } batadv_neigh_node_put(best_router); return best_entry; } /** * batadv_tt_global_dump_subentry() - Dump all TT local entries into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @common: tt local & tt global common data * @orig: Originator node announcing a non-mesh client * @best: Is the best originator for the TT entry * * Return: Error code, or 0 on success */ static int batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_tt_common_entry *common, struct batadv_tt_orig_list_entry *orig, bool best) { u16 flags = (common->flags & (~BATADV_TT_SYNC_MASK)) | orig->flags; void *hdr; struct batadv_orig_node_vlan *vlan; u8 last_ttvn; u32 crc; vlan = batadv_orig_node_vlan_get(orig->orig_node, common->vid); if (!vlan) return 0; crc = vlan->tt.crc; batadv_orig_node_vlan_put(vlan); hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_TRANSTABLE_GLOBAL); if (!hdr) return -ENOBUFS; last_ttvn = atomic_read(&orig->orig_node->last_ttvn); if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig->orig_node->orig) || nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) || nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, flags)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_tt_global_dump_entry() - Dump one TT global entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @common: tt local & tt global common data * @sub_s: Number of entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_tt_common_entry *common, int *sub_s) { struct batadv_tt_orig_list_entry *orig_entry, *best_entry; struct batadv_tt_global_entry *global; struct hlist_head *head; int sub = 0; bool best; global = container_of(common, struct batadv_tt_global_entry, common); best_entry = batadv_transtable_best_orig(bat_priv, global); head = &global->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) { if (sub++ < *sub_s) continue; best = (orig_entry == best_entry); if (batadv_tt_global_dump_subentry(msg, portid, seq, common, orig_entry, best)) { *sub_s = sub - 1; return -EMSGSIZE; } } *sub_s = 0; return 0; } /** * batadv_tt_global_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @head: Pointer to the list containing the global tt entries * @idx_s: Number of entries to skip * @sub: Number of entries to skip * * Return: Error code, or 0 on success */ static int batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_tt_common_entry *common; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(common, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv, common, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_tt_global_dump() - Dump TT global entries into a message * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: Error code, or length of message on success */ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; struct hlist_head *head; int ret; int ifindex; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return -EINVAL; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { ret = -ENODEV; goto out; } bat_priv = netdev_priv(soft_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } hash = bat_priv->tt.global_hash; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_tt_global_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, head, &idx, &sub)) break; bucket++; } ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; return ret; } /** * _batadv_tt_global_del_orig_entry() - remove and free an orig_entry * @tt_global_entry: the global entry to remove the orig_entry from * @orig_entry: the orig entry to remove and free * * Remove an orig_entry from its list in the given tt_global_entry and * free this orig_entry afterwards. * * Caller must hold tt_global_entry->list_lock and ensure orig_entry->list is * part of a list. */ static void _batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, struct batadv_tt_orig_list_entry *orig_entry) { lockdep_assert_held(&tt_global_entry->list_lock); batadv_tt_global_size_dec(orig_entry->orig_node, tt_global_entry->common.vid); atomic_dec(&tt_global_entry->orig_list_count); /* requires holding tt_global_entry->list_lock and orig_entry->list * being part of a list */ hlist_del_rcu(&orig_entry->list); batadv_tt_orig_list_entry_put(orig_entry); } /* deletes the orig list of a tt_global_entry */ static void batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry) { struct hlist_head *head; struct hlist_node *safe; struct batadv_tt_orig_list_entry *orig_entry; spin_lock_bh(&tt_global_entry->list_lock); head = &tt_global_entry->orig_list; hlist_for_each_entry_safe(orig_entry, safe, head, list) _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); spin_unlock_bh(&tt_global_entry->list_lock); } /** * batadv_tt_global_del_orig_node() - remove orig_node from a global tt entry * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: the global entry to remove the orig_node from * @orig_node: the originator announcing the client * @message: message to append to the log on deletion * * Remove the given orig_node and its according orig_entry from the given * global tt entry. */ static void batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry, struct batadv_orig_node *orig_node, const char *message) { struct hlist_head *head; struct hlist_node *safe; struct batadv_tt_orig_list_entry *orig_entry; unsigned short vid; spin_lock_bh(&tt_global_entry->list_lock); head = &tt_global_entry->orig_list; hlist_for_each_entry_safe(orig_entry, safe, head, list) { if (orig_entry->orig_node == orig_node) { vid = tt_global_entry->common.vid; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting %pM from global tt entry %pM (vid: %d): %s\n", orig_node->orig, tt_global_entry->common.addr, batadv_print_vid(vid), message); _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); } } spin_unlock_bh(&tt_global_entry->list_lock); } /* If the client is to be deleted, we check if it is the last origantor entry * within tt_global entry. If yes, we set the BATADV_TT_CLIENT_ROAM flag and the * timer, otherwise we simply remove the originator scheduled for deletion. */ static void batadv_tt_global_del_roaming(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry, struct batadv_orig_node *orig_node, const char *message) { bool last_entry = true; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry; /* no local entry exists, case 1: * Check if this is the last one or if other entries exist. */ rcu_read_lock(); head = &tt_global_entry->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) { if (orig_entry->orig_node != orig_node) { last_entry = false; break; } } rcu_read_unlock(); if (last_entry) { /* its the last one, mark for roaming. */ tt_global_entry->common.flags |= BATADV_TT_CLIENT_ROAM; tt_global_entry->roam_at = jiffies; } else { /* there is another entry, we can simply delete this * one and can still use the other one. */ batadv_tt_global_del_orig_node(bat_priv, tt_global_entry, orig_node, message); } } /** * batadv_tt_global_del() - remove a client from the global table * @bat_priv: the bat priv with all the soft interface information * @orig_node: an originator serving this client * @addr: the mac address of the client * @vid: VLAN identifier * @message: a message explaining the reason for deleting the client to print * for debugging purpose * @roaming: true if the deletion has been triggered by a roaming event */ static void batadv_tt_global_del(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid, const char *message, bool roaming) { struct batadv_tt_global_entry *tt_global_entry; struct batadv_tt_local_entry *local_entry = NULL; tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) goto out; if (!roaming) { batadv_tt_global_del_orig_node(bat_priv, tt_global_entry, orig_node, message); if (hlist_empty(&tt_global_entry->orig_list)) batadv_tt_global_free(bat_priv, tt_global_entry, message); goto out; } /* if we are deleting a global entry due to a roam * event, there are two possibilities: * 1) the client roamed from node A to node B => if there * is only one originator left for this client, we mark * it with BATADV_TT_CLIENT_ROAM, we start a timer and we * wait for node B to claim it. In case of timeout * the entry is purged. * * If there are other originators left, we directly delete * the originator. * 2) the client roamed to us => we can directly delete * the global entry, since it is useless now. */ local_entry = batadv_tt_local_hash_find(bat_priv, tt_global_entry->common.addr, vid); if (local_entry) { /* local entry exists, case 2: client roamed to us. */ batadv_tt_global_del_orig_list(tt_global_entry); batadv_tt_global_free(bat_priv, tt_global_entry, message); } else { /* no local entry exists, case 1: check for roaming */ batadv_tt_global_del_roaming(bat_priv, tt_global_entry, orig_node, message); } out: batadv_tt_global_entry_put(tt_global_entry); batadv_tt_local_entry_put(local_entry); } /** * batadv_tt_global_del_orig() - remove all the TT global entries belonging to * the given originator matching the provided vid * @bat_priv: the bat priv with all the soft interface information * @orig_node: the originator owning the entries to remove * @match_vid: the VLAN identifier to match. If negative all the entries will be * removed * @message: debug message to print as "reason" */ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message) { struct batadv_tt_global_entry *tt_global; struct batadv_tt_common_entry *tt_common_entry; u32 i; struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct hlist_node *safe; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ unsigned short vid; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common_entry, safe, head, hash_entry) { /* remove only matching entries */ if (match_vid >= 0 && tt_common_entry->vid != match_vid) continue; tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, common); batadv_tt_global_del_orig_node(bat_priv, tt_global, orig_node, message); if (hlist_empty(&tt_global->orig_list)) { vid = tt_global->common.vid; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(vid), message); hlist_del_rcu(&tt_common_entry->hash_entry); batadv_tt_global_entry_put(tt_global); } } spin_unlock_bh(list_lock); } clear_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global, char **msg) { bool purge = false; unsigned long roam_timeout = BATADV_TT_CLIENT_ROAM_TIMEOUT; unsigned long temp_timeout = BATADV_TT_CLIENT_TEMP_TIMEOUT; if ((tt_global->common.flags & BATADV_TT_CLIENT_ROAM) && batadv_has_timed_out(tt_global->roam_at, roam_timeout)) { purge = true; *msg = "Roaming timeout\n"; } if ((tt_global->common.flags & BATADV_TT_CLIENT_TEMP) && batadv_has_timed_out(tt_global->common.added_at, temp_timeout)) { purge = true; *msg = "Temporary client timeout\n"; } return purge; } static void batadv_tt_global_purge(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct hlist_head *head; struct hlist_node *node_tmp; spinlock_t *list_lock; /* protects write access to the hash lists */ u32 i; char *msg = NULL; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common, node_tmp, head, hash_entry) { tt_global = container_of(tt_common, struct batadv_tt_global_entry, common); if (!batadv_tt_global_to_purge(tt_global, &msg)) continue; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(tt_global->common.vid), msg); hlist_del_rcu(&tt_common->hash_entry); batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } } static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_global_entry *tt_global; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; if (!bat_priv->tt.global_hash) return; hash = bat_priv->tt.global_hash; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { hlist_del_rcu(&tt_common_entry->hash_entry); tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, common); batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } batadv_hash_destroy(hash); bat_priv->tt.global_hash = NULL; } static bool _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, struct batadv_tt_global_entry *tt_global_entry) { if (tt_local_entry->common.flags & BATADV_TT_CLIENT_WIFI && tt_global_entry->common.flags & BATADV_TT_CLIENT_WIFI) return true; /* check if the two clients are marked as isolated */ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_ISOLA && tt_global_entry->common.flags & BATADV_TT_CLIENT_ISOLA) return true; return false; } /** * batadv_transtable_search() - get the mesh destination for a given client * @bat_priv: the bat priv with all the soft interface information * @src: mac address of the source client * @addr: mac address of the destination client * @vid: VLAN identifier * * Return: a pointer to the originator that was selected as destination in the * mesh for contacting the client 'addr', NULL otherwise. * In case of multiple originators serving the same client, the function returns * the best one (best in terms of metric towards the destination node). * * If the two clients are AP isolated the function returns NULL. */ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, const u8 *src, const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry = NULL; struct batadv_tt_global_entry *tt_global_entry = NULL; struct batadv_orig_node *orig_node = NULL; struct batadv_tt_orig_list_entry *best_entry; if (src && batadv_vlan_ap_isola_get(bat_priv, vid)) { tt_local_entry = batadv_tt_local_hash_find(bat_priv, src, vid); if (!tt_local_entry || (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING)) goto out; } tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) goto out; /* check whether the clients should not communicate due to AP * isolation */ if (tt_local_entry && _batadv_is_ap_isolated(tt_local_entry, tt_global_entry)) goto out; rcu_read_lock(); best_entry = batadv_transtable_best_orig(bat_priv, tt_global_entry); /* found anything? */ if (best_entry) orig_node = best_entry->orig_node; if (orig_node && !kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; rcu_read_unlock(); out: batadv_tt_global_entry_put(tt_global_entry); batadv_tt_local_entry_put(tt_local_entry); return orig_node; } /** * batadv_tt_global_crc() - calculates the checksum of the local table belonging * to the given orig_node * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator for which the CRC should be computed * @vid: VLAN identifier for which the CRC32 has to be computed * * This function computes the checksum for the global table corresponding to a * specific originator. In particular, the checksum is computed as follows: For * each client connected to the originator the CRC32C of the MAC address and the * VID is computed and then all the CRC32Cs of the various clients are xor'ed * together. * * The idea behind is that CRC32C should be used as much as possible in order to * produce a unique hash of the table, but since the order which is used to feed * the CRC32C function affects the result and since every node in the network * probably sorts the clients differently, the hash function cannot be directly * computed over the entire table. Hence the CRC32C is used only on * the single client entry, while all the results are then xor'ed together * because the XOR operation can combine them all while trying to reduce the * noise as much as possible. * * Return: the checksum of the global table of a given originator. */ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct batadv_tt_orig_list_entry *tt_orig; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; u32 i, crc_tmp, crc = 0; u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_common, head, hash_entry) { tt_global = container_of(tt_common, struct batadv_tt_global_entry, common); /* compute the CRC only for entries belonging to the * VLAN identified by the vid passed as parameter */ if (tt_common->vid != vid) continue; /* Roaming clients are in the global table for * consistency only. They don't have to be * taken into account while computing the * global crc */ if (tt_common->flags & BATADV_TT_CLIENT_ROAM) continue; /* Temporary clients have not been announced yet, so * they have to be skipped while computing the global * crc */ if (tt_common->flags & BATADV_TT_CLIENT_TEMP) continue; /* find out if this global entry is announced by this * originator */ tt_orig = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (!tt_orig) continue; /* use network order to read the VID: this ensures that * every node reads the bytes in the same order. */ tmp_vid = htons(tt_common->vid); crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes */ flags = tt_orig->flags; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); batadv_tt_orig_list_entry_put(tt_orig); } rcu_read_unlock(); } return crc; } /** * batadv_tt_local_crc() - calculates the checksum of the local table * @bat_priv: the bat priv with all the soft interface information * @vid: VLAN identifier for which the CRC32 has to be computed * * For details about the computation, please refer to the documentation for * batadv_tt_global_crc(). * * Return: the checksum of the local table */ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct hlist_head *head; u32 i, crc_tmp, crc = 0; u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_common, head, hash_entry) { /* compute the CRC only for entries belonging to the * VLAN identified by vid */ if (tt_common->vid != vid) continue; /* not yet committed clients have not to be taken into * account while computing the CRC */ if (tt_common->flags & BATADV_TT_CLIENT_NEW) continue; /* use network order to read the VID: this ensures that * every node reads the bytes in the same order. */ tmp_vid = htons(tt_common->vid); crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes */ flags = tt_common->flags & BATADV_TT_SYNC_MASK; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); } rcu_read_unlock(); } return crc; } /** * batadv_tt_req_node_release() - free tt_req node entry * @ref: kref pointer of the tt req_node entry */ static void batadv_tt_req_node_release(struct kref *ref) { struct batadv_tt_req_node *tt_req_node; tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount); kmem_cache_free(batadv_tt_req_cache, tt_req_node); } /** * batadv_tt_req_node_put() - decrement the tt_req_node refcounter and * possibly release it * @tt_req_node: tt_req_node to be free'd */ static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node) { if (!tt_req_node) return; kref_put(&tt_req_node->refcount, batadv_tt_req_node_release); } static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_req_node *node; struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { hlist_del_init(&node->list); batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); } static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const void *tt_buff, u16 tt_buff_len) { /* Replace the old buffer only if I received something in the * last OGM (the OGM could carry no changes) */ spin_lock_bh(&orig_node->tt_buff_lock); if (tt_buff_len > 0) { kfree(orig_node->tt_buff); orig_node->tt_buff_len = 0; orig_node->tt_buff = kmalloc(tt_buff_len, GFP_ATOMIC); if (orig_node->tt_buff) { memcpy(orig_node->tt_buff, tt_buff, tt_buff_len); orig_node->tt_buff_len = tt_buff_len; } } spin_unlock_bh(&orig_node->tt_buff_lock); } static void batadv_tt_req_purge(struct batadv_priv *bat_priv) { struct batadv_tt_req_node *node; struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (batadv_has_timed_out(node->issued_at, BATADV_TT_REQUEST_TIMEOUT)) { hlist_del_init(&node->list); batadv_tt_req_node_put(node); } } spin_unlock_bh(&bat_priv->tt.req_list_lock); } /** * batadv_tt_req_node_new() - search and possibly create a tt_req_node object * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node this request is being issued for * * Return: the pointer to the new tt_req_node struct if no request * has already been issued for this orig_node, NULL otherwise. */ static struct batadv_tt_req_node * batadv_tt_req_node_new(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_tt_req_node *tt_req_node_tmp, *tt_req_node = NULL; spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) { if (batadv_compare_eth(tt_req_node_tmp, orig_node) && !batadv_has_timed_out(tt_req_node_tmp->issued_at, BATADV_TT_REQUEST_TIMEOUT)) goto unlock; } tt_req_node = kmem_cache_alloc(batadv_tt_req_cache, GFP_ATOMIC); if (!tt_req_node) goto unlock; kref_init(&tt_req_node->refcount); ether_addr_copy(tt_req_node->addr, orig_node->orig); tt_req_node->issued_at = jiffies; kref_get(&tt_req_node->refcount); hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list); unlock: spin_unlock_bh(&bat_priv->tt.req_list_lock); return tt_req_node; } /** * batadv_tt_local_valid() - verify local tt entry and get flags * @entry_ptr: to be checked local tt entry * @data_ptr: not used but definition required to satisfy the callback prototype * @flags: a pointer to store TT flags for this client to * * Checks the validity of the given local TT entry. If it is, then the provided * flags pointer is updated. * * Return: true if the entry is a valid, false otherwise. */ static bool batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr, u8 *flags) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; if (tt_common_entry->flags & BATADV_TT_CLIENT_NEW) return false; if (flags) *flags = tt_common_entry->flags; return true; } /** * batadv_tt_global_valid() - verify global tt entry and get flags * @entry_ptr: to be checked global tt entry * @data_ptr: an orig_node object (may be NULL) * @flags: a pointer to store TT flags for this client to * * Checks the validity of the given global TT entry. If it is, then the provided * flags pointer is updated either with the common (summed) TT flags if data_ptr * is NULL or the specific, per originator TT flags otherwise. * * Return: true if the entry is a valid, false otherwise. */ static bool batadv_tt_global_valid(const void *entry_ptr, const void *data_ptr, u8 *flags) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; const struct batadv_tt_global_entry *tt_global_entry; const struct batadv_orig_node *orig_node = data_ptr; if (tt_common_entry->flags & BATADV_TT_CLIENT_ROAM || tt_common_entry->flags & BATADV_TT_CLIENT_TEMP) return false; tt_global_entry = container_of(tt_common_entry, struct batadv_tt_global_entry, common); return batadv_tt_global_entry_has_orig(tt_global_entry, orig_node, flags); } /** * batadv_tt_tvlv_generate() - fill the tvlv buff with the tt entries from the * specified tt hash * @bat_priv: the bat priv with all the soft interface information * @hash: hash table containing the tt entries * @tt_len: expected tvlv tt data buffer length in number of bytes * @tvlv_buff: pointer to the buffer to fill with the TT data * @valid_cb: function to filter tt change entries and to return TT flags * @cb_data: data passed to the filter function as argument * * Fills the tvlv buff with the tt entries from the specified hash. If valid_cb * is not provided then this becomes a no-op. */ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, void *tvlv_buff, u16 tt_len, bool (*valid_cb)(const void *, const void *, u8 *flags), void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tvlv_tt_change *tt_change; struct hlist_head *head; u16 tt_tot, tt_num_entries = 0; u8 flags; bool ret; u32 i; tt_tot = batadv_tt_entries(tt_len); tt_change = (struct batadv_tvlv_tt_change *)tvlv_buff; if (!valid_cb) return; rcu_read_lock(); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { if (tt_tot == tt_num_entries) break; ret = valid_cb(tt_common_entry, cb_data, &flags); if (!ret) continue; ether_addr_copy(tt_change->addr, tt_common_entry->addr); tt_change->flags = flags; tt_change->vid = htons(tt_common_entry->vid); memset(tt_change->reserved, 0, sizeof(tt_change->reserved)); tt_num_entries++; tt_change++; } } rcu_read_unlock(); } /** * batadv_tt_global_check_crc() - check if all the CRCs are correct * @orig_node: originator for which the CRCs have to be checked * @tt_vlan: pointer to the first tvlv VLAN entry * @num_vlan: number of tvlv VLAN entries * * Return: true if all the received CRCs match the locally stored ones, false * otherwise */ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_vlan_data *tt_vlan, u16 num_vlan) { struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp; struct batadv_orig_node_vlan *vlan; int i, orig_num_vlan; u32 crc; /* check if each received CRC matches the locally stored one */ for (i = 0; i < num_vlan; i++) { tt_vlan_tmp = tt_vlan + i; /* if orig_node is a backbone node for this VLAN, don't check * the CRC as we ignore all the global entries over it */ if (batadv_bla_is_backbone_gw_orig(orig_node->bat_priv, orig_node->orig, ntohs(tt_vlan_tmp->vid))) continue; vlan = batadv_orig_node_vlan_get(orig_node, ntohs(tt_vlan_tmp->vid)); if (!vlan) return false; crc = vlan->tt.crc; batadv_orig_node_vlan_put(vlan); if (crc != ntohl(tt_vlan_tmp->crc)) return false; } /* check if any excess VLANs exist locally for the originator * which are not mentioned in the TVLV from the originator. */ rcu_read_lock(); orig_num_vlan = 0; hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) orig_num_vlan++; rcu_read_unlock(); if (orig_num_vlan > num_vlan) return false; return true; } /** * batadv_tt_local_update_crc() - update all the local CRCs * @bat_priv: the bat priv with all the soft interface information */ static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv) { struct batadv_softif_vlan *vlan; /* recompute the global CRC for each VLAN */ rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { vlan->tt.crc = batadv_tt_local_crc(bat_priv, vlan->vid); } rcu_read_unlock(); } /** * batadv_tt_global_update_crc() - update all the global CRCs for this orig_node * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig_node for which the CRCs have to be updated */ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_node_vlan *vlan; u32 crc; /* recompute the global CRC for each VLAN */ rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { /* if orig_node is a backbone node for this VLAN, don't compute * the CRC as we ignore all the global entries over it */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vlan->vid)) continue; crc = batadv_tt_global_crc(bat_priv, orig_node, vlan->vid); vlan->tt.crc = crc; } rcu_read_unlock(); } /** * batadv_send_tt_request() - send a TT Request message to a given node * @bat_priv: the bat priv with all the soft interface information * @dst_orig_node: the destination of the message * @ttvn: the version number that the source of the message is looking for * @tt_vlan: pointer to the first tvlv VLAN object to request * @num_vlan: number of tvlv VLAN entries * @full_table: ask for the entire translation table if true, while only for the * last TT diff otherwise * * Return: true if the TT Request was sent, false otherwise */ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, struct batadv_orig_node *dst_orig_node, u8 ttvn, struct batadv_tvlv_tt_vlan_data *tt_vlan, u16 num_vlan, bool full_table) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tt_req_node *tt_req_node = NULL; struct batadv_tvlv_tt_vlan_data *tt_vlan_req; struct batadv_hard_iface *primary_if; bool ret = false; int i, size; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* The new tt_req will be issued only if I'm not waiting for a * reply from the same orig_node yet */ tt_req_node = batadv_tt_req_node_new(bat_priv, dst_orig_node); if (!tt_req_node) goto out; size = sizeof(*tvlv_tt_data) + sizeof(*tt_vlan_req) * num_vlan; tvlv_tt_data = kzalloc(size, GFP_ATOMIC); if (!tvlv_tt_data) goto out; tvlv_tt_data->flags = BATADV_TT_REQUEST; tvlv_tt_data->ttvn = ttvn; tvlv_tt_data->num_vlan = htons(num_vlan); /* send all the CRCs within the request. This is needed by intermediate * nodes to ensure they have the correct table before replying */ tt_vlan_req = (struct batadv_tvlv_tt_vlan_data *)(tvlv_tt_data + 1); for (i = 0; i < num_vlan; i++) { tt_vlan_req->vid = tt_vlan->vid; tt_vlan_req->crc = tt_vlan->crc; tt_vlan_req++; tt_vlan++; } if (full_table) tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_REQUEST to %pM [%c]\n", dst_orig_node->orig, full_table ? 'F' : '.'); batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_TX); batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr, dst_orig_node->orig, BATADV_TVLV_TT, 1, tvlv_tt_data, size); ret = true; out: batadv_hardif_put(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); if (!hlist_unhashed(&tt_req_node->list)) { hlist_del_init(&tt_req_node->list); batadv_tt_req_node_put(tt_req_node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); } batadv_tt_req_node_put(tt_req_node); kfree(tvlv_tt_data); return ret; } /** * batadv_send_other_tt_response() - send reply to tt request concerning another * node's translation table * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *req_src, u8 *req_dst) { struct batadv_orig_node *req_dst_orig_node; struct batadv_orig_node *res_dst_orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tvlv_tt_vlan_data *tt_vlan; bool ret = false, full_table; u8 orig_ttvn, req_ttvn; u16 tvlv_len; s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n", req_src, tt_data->ttvn, req_dst, ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); /* Let's get the orig node of the REAL destination */ req_dst_orig_node = batadv_orig_hash_find(bat_priv, req_dst); if (!req_dst_orig_node) goto out; res_dst_orig_node = batadv_orig_hash_find(bat_priv, req_src); if (!res_dst_orig_node) goto out; orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn); req_ttvn = tt_data->ttvn; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); /* this node doesn't have the requested data */ if (orig_ttvn != req_ttvn || !batadv_tt_global_check_crc(req_dst_orig_node, tt_vlan, ntohs(tt_data->num_vlan))) goto out; /* If the full table has been explicitly requested */ if (tt_data->flags & BATADV_TT_FULL_TABLE || !req_dst_orig_node->tt_buff) full_table = true; else full_table = false; /* TT fragmentation hasn't been implemented yet, so send as many * TT entries fit a single packet as possible only */ if (!full_table) { spin_lock_bh(&req_dst_orig_node->tt_buff_lock); tt_len = req_dst_orig_node->tt_buff_len; tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len) goto unlock; /* Copy the last orig_node's OGM buffer */ memcpy(tt_change, req_dst_orig_node->tt_buff, req_dst_orig_node->tt_buff_len); spin_unlock_bh(&req_dst_orig_node->tt_buff_lock); } else { /* allocate the tvlv, put the tt_data and all the tt_vlan_data * in the initial part */ tt_len = -1; tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len) goto out; /* fill the rest of the tvlv with the real TT entries */ batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.global_hash, tt_change, tt_len, batadv_tt_global_valid, req_dst_orig_node); } /* Don't send the response, if larger than fragmented packet. */ tt_len = sizeof(struct batadv_unicast_tvlv_packet) + tvlv_len; if (tt_len > atomic_read(&bat_priv->packet_size_max)) { net_ratelimited_function(batadv_info, bat_priv->soft_iface, "Ignoring TT_REQUEST from %pM; Response size exceeds max packet size.\n", res_dst_orig_node->orig); goto out; } tvlv_tt_data->flags = BATADV_TT_RESPONSE; tvlv_tt_data->ttvn = req_ttvn; if (full_table) tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_RESPONSE %pM for %pM [%c] (ttvn: %u)\n", res_dst_orig_node->orig, req_dst_orig_node->orig, full_table ? 'F' : '.', req_ttvn); batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX); batadv_tvlv_unicast_send(bat_priv, req_dst_orig_node->orig, req_src, BATADV_TVLV_TT, 1, tvlv_tt_data, tvlv_len); ret = true; goto out; unlock: spin_unlock_bh(&req_dst_orig_node->tt_buff_lock); out: batadv_orig_node_put(res_dst_orig_node); batadv_orig_node_put(req_dst_orig_node); kfree(tvlv_tt_data); return ret; } /** * batadv_send_my_tt_response() - send reply to tt request concerning this * node's translation table * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *req_src) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_hard_iface *primary_if = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_orig_node *orig_node; u8 my_ttvn, req_ttvn; u16 tvlv_len; bool full_table; s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n", req_src, tt_data->ttvn, ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); spin_lock_bh(&bat_priv->tt.commit_lock); my_ttvn = (u8)atomic_read(&bat_priv->tt.vn); req_ttvn = tt_data->ttvn; orig_node = batadv_orig_hash_find(bat_priv, req_src); if (!orig_node) goto out; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* If the full table has been explicitly requested or the gap * is too big send the whole local translation table */ if (tt_data->flags & BATADV_TT_FULL_TABLE || my_ttvn != req_ttvn || !bat_priv->tt.last_changeset) full_table = true; else full_table = false; /* TT fragmentation hasn't been implemented yet, so send as many * TT entries fit a single packet as possible only */ if (!full_table) { spin_lock_bh(&bat_priv->tt.last_changeset_lock); tt_len = bat_priv->tt.last_changeset_len; tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len || !tvlv_len) goto unlock; /* Copy the last orig_node's OGM buffer */ memcpy(tt_change, bat_priv->tt.last_changeset, bat_priv->tt.last_changeset_len); spin_unlock_bh(&bat_priv->tt.last_changeset_lock); } else { req_ttvn = (u8)atomic_read(&bat_priv->tt.vn); /* allocate the tvlv, put the tt_data and all the tt_vlan_data * in the initial part */ tt_len = -1; tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len || !tvlv_len) goto out; /* fill the rest of the tvlv with the real TT entries */ batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.local_hash, tt_change, tt_len, batadv_tt_local_valid, NULL); } tvlv_tt_data->flags = BATADV_TT_RESPONSE; tvlv_tt_data->ttvn = req_ttvn; if (full_table) tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_RESPONSE to %pM [%c] (ttvn: %u)\n", orig_node->orig, full_table ? 'F' : '.', req_ttvn); batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX); batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr, req_src, BATADV_TVLV_TT, 1, tvlv_tt_data, tvlv_len); goto out; unlock: spin_unlock_bh(&bat_priv->tt.last_changeset_lock); out: spin_unlock_bh(&bat_priv->tt.commit_lock); batadv_orig_node_put(orig_node); batadv_hardif_put(primary_if); kfree(tvlv_tt_data); /* The packet was for this host, so it doesn't need to be re-routed */ return true; } /** * batadv_send_tt_response() - send reply to tt request * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *req_src, u8 *req_dst) { if (batadv_is_my_mac(bat_priv, req_dst)) return batadv_send_my_tt_response(bat_priv, tt_data, req_src); return batadv_send_other_tt_response(bat_priv, tt_data, req_src, req_dst); } static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_change *tt_change, u16 tt_num_changes, u8 ttvn) { int i; int roams; for (i = 0; i < tt_num_changes; i++) { if ((tt_change + i)->flags & BATADV_TT_CLIENT_DEL) { roams = (tt_change + i)->flags & BATADV_TT_CLIENT_ROAM; batadv_tt_global_del(bat_priv, orig_node, (tt_change + i)->addr, ntohs((tt_change + i)->vid), "tt removed by changes", roams); } else { if (!batadv_tt_global_add(bat_priv, orig_node, (tt_change + i)->addr, ntohs((tt_change + i)->vid), (tt_change + i)->flags, ttvn)) /* In case of problem while storing a * global_entry, we stop the updating * procedure without committing the * ttvn change. This will avoid to send * corrupted data on tt_request */ return; } } set_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_change *tt_change, u8 ttvn, u8 *resp_src, u16 num_entries) { struct batadv_orig_node *orig_node; orig_node = batadv_orig_hash_find(bat_priv, resp_src); if (!orig_node) goto out; /* Purge the old table first.. */ batadv_tt_global_del_orig(bat_priv, orig_node, -1, "Received full table"); _batadv_tt_update_changes(bat_priv, orig_node, tt_change, num_entries, ttvn); spin_lock_bh(&orig_node->tt_buff_lock); kfree(orig_node->tt_buff); orig_node->tt_buff_len = 0; orig_node->tt_buff = NULL; spin_unlock_bh(&orig_node->tt_buff_lock); atomic_set(&orig_node->last_ttvn, ttvn); out: batadv_orig_node_put(orig_node); } static void batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, u16 tt_num_changes, u8 ttvn, struct batadv_tvlv_tt_change *tt_change) { _batadv_tt_update_changes(bat_priv, orig_node, tt_change, tt_num_changes, ttvn); batadv_tt_save_orig_buffer(bat_priv, orig_node, tt_change, batadv_tt_len(tt_num_changes)); atomic_set(&orig_node->last_ttvn, ttvn); } /** * batadv_is_my_client() - check if a client is served by the local node * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to check * @vid: VLAN identifier * * Return: true if the client is served by this node, false otherwise. */ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; bool ret = false; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) goto out; /* Check if the client has been logically deleted (but is kept for * consistency purpose) */ if ((tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING) || (tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM)) goto out; ret = true; out: batadv_tt_local_entry_put(tt_local_entry); return ret; } /** * batadv_handle_tt_response() - process incoming tt reply * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @resp_src: mac address of tt reply sender * @num_entries: number of tt change entries appended to the tt data */ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *resp_src, u16 num_entries) { struct batadv_tt_req_node *node; struct hlist_node *safe; struct batadv_orig_node *orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; u8 *tvlv_ptr = (u8 *)tt_data; u16 change_offset; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n", resp_src, tt_data->ttvn, num_entries, ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); orig_node = batadv_orig_hash_find(bat_priv, resp_src); if (!orig_node) goto out; spin_lock_bh(&orig_node->tt_lock); change_offset = sizeof(struct batadv_tvlv_tt_vlan_data); change_offset *= ntohs(tt_data->num_vlan); change_offset += sizeof(*tt_data); tvlv_ptr += change_offset; tt_change = (struct batadv_tvlv_tt_change *)tvlv_ptr; if (tt_data->flags & BATADV_TT_FULL_TABLE) { batadv_tt_fill_gtable(bat_priv, tt_change, tt_data->ttvn, resp_src, num_entries); } else { batadv_tt_update_changes(bat_priv, orig_node, num_entries, tt_data->ttvn, tt_change); } /* Recalculate the CRC for this orig_node and store it */ batadv_tt_global_update_crc(bat_priv, orig_node); spin_unlock_bh(&orig_node->tt_lock); /* Delete the tt_req_node from pending tt_requests list */ spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (!batadv_compare_eth(node->addr, resp_src)) continue; hlist_del_init(&node->list); batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); out: batadv_orig_node_put(orig_node); } static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_roam_node *node, *safe; spin_lock_bh(&bat_priv->tt.roam_list_lock); list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) { list_del(&node->list); kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); } static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) { struct batadv_tt_roam_node *node, *safe; spin_lock_bh(&bat_priv->tt.roam_list_lock); list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) { if (!batadv_has_timed_out(node->first_time, BATADV_ROAMING_MAX_TIME)) continue; list_del(&node->list); kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); } /** * batadv_tt_check_roam_count() - check if a client has roamed too frequently * @bat_priv: the bat priv with all the soft interface information * @client: mac address of the roaming client * * This function checks whether the client already reached the * maximum number of possible roaming phases. In this case the ROAMING_ADV * will not be sent. * * Return: true if the ROAMING_ADV can be sent, false otherwise */ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) { struct batadv_tt_roam_node *tt_roam_node; bool ret = false; spin_lock_bh(&bat_priv->tt.roam_list_lock); /* The new tt_req will be issued only if I'm not waiting for a * reply from the same orig_node yet */ list_for_each_entry(tt_roam_node, &bat_priv->tt.roam_list, list) { if (!batadv_compare_eth(tt_roam_node->addr, client)) continue; if (batadv_has_timed_out(tt_roam_node->first_time, BATADV_ROAMING_MAX_TIME)) continue; if (!batadv_atomic_dec_not_zero(&tt_roam_node->counter)) /* Sorry, you roamed too many times! */ goto unlock; ret = true; break; } if (!ret) { tt_roam_node = kmem_cache_alloc(batadv_tt_roam_cache, GFP_ATOMIC); if (!tt_roam_node) goto unlock; tt_roam_node->first_time = jiffies; atomic_set(&tt_roam_node->counter, BATADV_ROAMING_MAX_COUNT - 1); ether_addr_copy(tt_roam_node->addr, client); list_add(&tt_roam_node->list, &bat_priv->tt.roam_list); ret = true; } unlock: spin_unlock_bh(&bat_priv->tt.roam_list_lock); return ret; } /** * batadv_send_roam_adv() - send a roaming advertisement message * @bat_priv: the bat priv with all the soft interface information * @client: mac address of the roaming client * @vid: VLAN identifier * @orig_node: message destination * * Send a ROAMING_ADV message to the node which was previously serving this * client. This is done to inform the node that from now on all traffic destined * for this particular roamed client has to be forwarded to the sender of the * roaming message. */ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node) { struct batadv_hard_iface *primary_if; struct batadv_tvlv_roam_adv tvlv_roam; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* before going on we have to check whether the client has * already roamed to us too many times */ if (!batadv_tt_check_roam_count(bat_priv, client)) goto out; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending ROAMING_ADV to %pM (client %pM, vid: %d)\n", orig_node->orig, client, batadv_print_vid(vid)); batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_TX); memcpy(tvlv_roam.client, client, sizeof(tvlv_roam.client)); tvlv_roam.vid = htons(vid); batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr, orig_node->orig, BATADV_TVLV_ROAM, 1, &tvlv_roam, sizeof(tvlv_roam)); out: batadv_hardif_put(primary_if); } static void batadv_tt_purge(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_tt *priv_tt; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_tt = container_of(delayed_work, struct batadv_priv_tt, work); bat_priv = container_of(priv_tt, struct batadv_priv, tt); batadv_tt_local_purge(bat_priv, BATADV_TT_LOCAL_TIMEOUT); batadv_tt_global_purge(bat_priv); batadv_tt_req_purge(bat_priv); batadv_tt_roam_purge(bat_priv); queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work, msecs_to_jiffies(BATADV_TT_WORK_PERIOD)); } /** * batadv_tt_free() - Free translation table of soft interface * @bat_priv: the bat priv with all the soft interface information */ void batadv_tt_free(struct batadv_priv *bat_priv) { batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_ROAM, 1); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_TT, 1); cancel_delayed_work_sync(&bat_priv->tt.work); batadv_tt_local_table_free(bat_priv); batadv_tt_global_table_free(bat_priv); batadv_tt_req_list_free(bat_priv); batadv_tt_changes_list_free(bat_priv); batadv_tt_roam_list_free(bat_priv); kfree(bat_priv->tt.last_changeset); } /** * batadv_tt_local_set_flags() - set or unset the specified flags on the local * table and possibly count them in the TT size * @bat_priv: the bat priv with all the soft interface information * @flags: the flag to switch * @enable: whether to set or unset the flag * @count: whether to increase the TT size by the number of changed entries */ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags, bool enable, bool count) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common_entry; struct hlist_head *head; u32 i; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { if (enable) { if ((tt_common_entry->flags & flags) == flags) continue; tt_common_entry->flags |= flags; } else { if (!(tt_common_entry->flags & flags)) continue; tt_common_entry->flags &= ~flags; } if (!count) continue; batadv_tt_local_size_inc(bat_priv, tt_common_entry->vid); } rcu_read_unlock(); } } /* Purge out all the tt local entries marked with BATADV_TT_CLIENT_PENDING */ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ u32 i; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common, node_tmp, head, hash_entry) { if (!(tt_common->flags & BATADV_TT_CLIENT_PENDING)) continue; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting local tt entry (%pM, vid: %d): pending\n", tt_common->addr, batadv_print_vid(tt_common->vid)); batadv_tt_local_size_dec(bat_priv, tt_common->vid); hlist_del_rcu(&tt_common->hash_entry); tt_local = container_of(tt_common, struct batadv_tt_local_entry, common); batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } } /** * batadv_tt_local_commit_changes_nolock() - commit all pending local tt changes * which have been queued in the time since the last commit * @bat_priv: the bat priv with all the soft interface information * * Caller must hold tt->commit_lock. */ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) { lockdep_assert_held(&bat_priv->tt.commit_lock); if (atomic_read(&bat_priv->tt.local_changes) < 1) { if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt)) batadv_tt_tvlv_container_update(bat_priv); return; } batadv_tt_local_set_flags(bat_priv, BATADV_TT_CLIENT_NEW, false, true); batadv_tt_local_purge_pending_clients(bat_priv); batadv_tt_local_update_crc(bat_priv); /* Increment the TTVN only once per OGM interval */ atomic_inc(&bat_priv->tt.vn); batadv_dbg(BATADV_DBG_TT, bat_priv, "Local changes committed, updating to ttvn %u\n", (u8)atomic_read(&bat_priv->tt.vn)); /* reset the sending counter */ atomic_set(&bat_priv->tt.ogm_append_cnt, BATADV_TT_OGM_APPEND_MAX); batadv_tt_tvlv_container_update(bat_priv); } /** * batadv_tt_local_commit_changes() - commit all pending local tt changes which * have been queued in the time since the last commit * @bat_priv: the bat priv with all the soft interface information */ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv) { spin_lock_bh(&bat_priv->tt.commit_lock); batadv_tt_local_commit_changes_nolock(bat_priv); spin_unlock_bh(&bat_priv->tt.commit_lock); } /** * batadv_is_ap_isolated() - Check if packet from upper layer should be dropped * @bat_priv: the bat priv with all the soft interface information * @src: source mac address of packet * @dst: destination mac address of packet * @vid: vlan id of packet * * Return: true when src+dst(+vid) pair should be isolated, false otherwise */ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; struct batadv_tt_global_entry *tt_global_entry; struct batadv_softif_vlan *vlan; bool ret = false; vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return false; if (!atomic_read(&vlan->ap_isolation)) goto vlan_put; tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid); if (!tt_local_entry) goto vlan_put; tt_global_entry = batadv_tt_global_hash_find(bat_priv, src, vid); if (!tt_global_entry) goto local_entry_put; if (_batadv_is_ap_isolated(tt_local_entry, tt_global_entry)) ret = true; batadv_tt_global_entry_put(tt_global_entry); local_entry_put: batadv_tt_local_entry_put(tt_local_entry); vlan_put: batadv_softif_vlan_put(vlan); return ret; } /** * batadv_tt_update_orig() - update global translation table with new tt * information received via ogms * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig_node of the ogm * @tt_buff: pointer to the first tvlv VLAN entry * @tt_num_vlan: number of tvlv VLAN entries * @tt_change: pointer to the first entry in the TT buffer * @tt_num_changes: number of tt changes inside the tt buffer * @ttvn: translation table version number of this changeset */ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const void *tt_buff, u16 tt_num_vlan, struct batadv_tvlv_tt_change *tt_change, u16 tt_num_changes, u8 ttvn) { u8 orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn); struct batadv_tvlv_tt_vlan_data *tt_vlan; bool full_table = true; bool has_tt_init; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)tt_buff; has_tt_init = test_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); /* orig table not initialised AND first diff is in the OGM OR the ttvn * increased by one -> we can apply the attached changes */ if ((!has_tt_init && ttvn == 1) || ttvn - orig_ttvn == 1) { /* the OGM could not contain the changes due to their size or * because they have already been sent BATADV_TT_OGM_APPEND_MAX * times. * In this case send a tt request */ if (!tt_num_changes) { full_table = false; goto request_table; } spin_lock_bh(&orig_node->tt_lock); batadv_tt_update_changes(bat_priv, orig_node, tt_num_changes, ttvn, tt_change); /* Even if we received the precomputed crc with the OGM, we * prefer to recompute it to spot any possible inconsistency * in the global table */ batadv_tt_global_update_crc(bat_priv, orig_node); spin_unlock_bh(&orig_node->tt_lock); /* The ttvn alone is not enough to guarantee consistency * because a single value could represent different states * (due to the wrap around). Thus a node has to check whether * the resulting table (after applying the changes) is still * consistent or not. E.g. a node could disconnect while its * ttvn is X and reconnect on ttvn = X + TTVN_MAX: in this case * checking the CRC value is mandatory to detect the * inconsistency */ if (!batadv_tt_global_check_crc(orig_node, tt_vlan, tt_num_vlan)) goto request_table; } else { /* if we missed more than one change or our tables are not * in sync anymore -> request fresh tt data */ if (!has_tt_init || ttvn != orig_ttvn || !batadv_tt_global_check_crc(orig_node, tt_vlan, tt_num_vlan)) { request_table: batadv_dbg(BATADV_DBG_TT, bat_priv, "TT inconsistency for %pM. Need to retrieve the correct information (ttvn: %u last_ttvn: %u num_changes: %u)\n", orig_node->orig, ttvn, orig_ttvn, tt_num_changes); batadv_send_tt_request(bat_priv, orig_node, ttvn, tt_vlan, tt_num_vlan, full_table); return; } } } /** * batadv_tt_global_client_is_roaming() - check if a client is marked as roaming * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to check * @vid: VLAN identifier * * Return: true if we know that the client has moved from its old originator * to another one. This entry is still kept for consistency purposes and will be * deleted later by a DEL or because of timeout */ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; bool ret = false; tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) goto out; ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM; batadv_tt_global_entry_put(tt_global_entry); out: return ret; } /** * batadv_tt_local_client_is_roaming() - tells whether the client is roaming * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the local client to query * @vid: VLAN identifier * * Return: true if the local client is known to be roaming (it is not served by * this node anymore) or not. If yes, the client is still present in the table * to keep the latter consistent with the node TTVN */ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; bool ret = false; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) goto out; ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM; batadv_tt_local_entry_put(tt_local_entry); out: return ret; } /** * batadv_tt_add_temporary_global_entry() - Add temporary entry to global TT * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which the temporary entry should be associated with * @addr: mac address of the client * @vid: VLAN id of the new temporary global translation table * * Return: true when temporary tt entry could be added, false otherwise */ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid) { /* ignore loop detect macs, they are not supposed to be in the tt local * data as well. */ if (batadv_bla_is_loopdetect_mac(addr)) return false; if (!batadv_tt_global_add(bat_priv, orig_node, addr, vid, BATADV_TT_CLIENT_TEMP, atomic_read(&orig_node->last_ttvn))) return false; batadv_dbg(BATADV_DBG_TT, bat_priv, "Added temporary global client (addr: %pM, vid: %d, orig: %pM)\n", addr, batadv_print_vid(vid), orig_node->orig); return true; } /** * batadv_tt_local_resize_to_mtu() - resize the local translation table fit the * maximum packet size that can be transported through the mesh * @soft_iface: netdev struct of the mesh interface * * Remove entries older than 'timeout' and half timeout if more entries need * to be removed. */ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); int packet_size_max = atomic_read(&bat_priv->packet_size_max); int table_size, timeout = BATADV_TT_LOCAL_TIMEOUT / 2; bool reduced = false; spin_lock_bh(&bat_priv->tt.commit_lock); while (true) { table_size = batadv_tt_local_table_transmit_size(bat_priv); if (packet_size_max >= table_size) break; batadv_tt_local_purge(bat_priv, timeout); batadv_tt_local_purge_pending_clients(bat_priv); timeout /= 2; reduced = true; net_ratelimited_function(batadv_info, soft_iface, "Forced to purge local tt entries to fit new maximum fragment MTU (%i)\n", packet_size_max); } /* commit these changes immediately, to avoid synchronization problem * with the TTVN */ if (reduced) batadv_tt_local_commit_changes_nolock(bat_priv); spin_unlock_bh(&bat_priv->tt.commit_lock); } /** * batadv_tt_tvlv_ogm_handler_v1() - process incoming tt tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data * @tvlv_value_len: tvlv buffer length */ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; u16 num_entries, num_vlan; if (tvlv_value_len < sizeof(*tt_data)) return; tt_data = (struct batadv_tvlv_tt_data *)tvlv_value; tvlv_value_len -= sizeof(*tt_data); num_vlan = ntohs(tt_data->num_vlan); if (tvlv_value_len < sizeof(*tt_vlan) * num_vlan) return; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan); tvlv_value_len -= sizeof(*tt_vlan) * num_vlan; num_entries = batadv_tt_entries(tvlv_value_len); batadv_tt_update_orig(bat_priv, orig, tt_vlan, num_vlan, tt_change, num_entries, tt_data->ttvn); } /** * batadv_tt_tvlv_unicast_handler_v1() - process incoming (unicast) tt tvlv * container * @bat_priv: the bat priv with all the soft interface information * @src: mac address of tt tvlv sender * @dst: mac address of tt tvlv recipient * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * * Return: NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_tt_data *tt_data; u16 tt_vlan_len, tt_num_entries; char tt_flag; bool ret; if (tvlv_value_len < sizeof(*tt_data)) return NET_RX_SUCCESS; tt_data = (struct batadv_tvlv_tt_data *)tvlv_value; tvlv_value_len -= sizeof(*tt_data); tt_vlan_len = sizeof(struct batadv_tvlv_tt_vlan_data); tt_vlan_len *= ntohs(tt_data->num_vlan); if (tvlv_value_len < tt_vlan_len) return NET_RX_SUCCESS; tvlv_value_len -= tt_vlan_len; tt_num_entries = batadv_tt_entries(tvlv_value_len); switch (tt_data->flags & BATADV_TT_DATA_TYPE_MASK) { case BATADV_TT_REQUEST: batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_RX); /* If this node cannot provide a TT response the tt_request is * forwarded */ ret = batadv_send_tt_response(bat_priv, tt_data, src, dst); if (!ret) { if (tt_data->flags & BATADV_TT_FULL_TABLE) tt_flag = 'F'; else tt_flag = '.'; batadv_dbg(BATADV_DBG_TT, bat_priv, "Routing TT_REQUEST to %pM [%c]\n", dst, tt_flag); /* tvlv API will re-route the packet */ return NET_RX_DROP; } break; case BATADV_TT_RESPONSE: batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_RX); if (batadv_is_my_mac(bat_priv, dst)) { batadv_handle_tt_response(bat_priv, tt_data, src, tt_num_entries); return NET_RX_SUCCESS; } if (tt_data->flags & BATADV_TT_FULL_TABLE) tt_flag = 'F'; else tt_flag = '.'; batadv_dbg(BATADV_DBG_TT, bat_priv, "Routing TT_RESPONSE to %pM [%c]\n", dst, tt_flag); /* tvlv API will re-route the packet */ return NET_RX_DROP; } return NET_RX_SUCCESS; } /** * batadv_roam_tvlv_unicast_handler_v1() - process incoming tt roam tvlv * container * @bat_priv: the bat priv with all the soft interface information * @src: mac address of tt tvlv sender * @dst: mac address of tt tvlv recipient * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * * Return: NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_roam_adv *roaming_adv; struct batadv_orig_node *orig_node = NULL; /* If this node is not the intended recipient of the * roaming advertisement the packet is forwarded * (the tvlv API will re-route the packet). */ if (!batadv_is_my_mac(bat_priv, dst)) return NET_RX_DROP; if (tvlv_value_len < sizeof(*roaming_adv)) goto out; orig_node = batadv_orig_hash_find(bat_priv, src); if (!orig_node) goto out; batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_RX); roaming_adv = (struct batadv_tvlv_roam_adv *)tvlv_value; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received ROAMING_ADV from %pM (client %pM)\n", src, roaming_adv->client); batadv_tt_global_add(bat_priv, orig_node, roaming_adv->client, ntohs(roaming_adv->vid), BATADV_TT_CLIENT_ROAM, atomic_read(&orig_node->last_ttvn) + 1); out: batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } /** * batadv_tt_init() - initialise the translation table internals * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure. */ int batadv_tt_init(struct batadv_priv *bat_priv) { int ret; /* synchronized flags must be remote */ BUILD_BUG_ON(!(BATADV_TT_SYNC_MASK & BATADV_TT_REMOTE_MASK)); ret = batadv_tt_local_init(bat_priv); if (ret < 0) return ret; ret = batadv_tt_global_init(bat_priv); if (ret < 0) { batadv_tt_local_table_free(bat_priv); return ret; } batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1, batadv_tt_tvlv_unicast_handler_v1, BATADV_TVLV_TT, 1, BATADV_NO_FLAGS); batadv_tvlv_handler_register(bat_priv, NULL, batadv_roam_tvlv_unicast_handler_v1, BATADV_TVLV_ROAM, 1, BATADV_NO_FLAGS); INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge); queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work, msecs_to_jiffies(BATADV_TT_WORK_PERIOD)); return 1; } /** * batadv_tt_global_is_isolated() - check if a client is marked as isolated * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client * @vid: the identifier of the VLAN where this client is connected * * Return: true if the client is marked with the TT_CLIENT_ISOLA flag, false * otherwise */ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt; bool ret; tt = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt) return false; ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA; batadv_tt_global_entry_put(tt); return ret; } /** * batadv_tt_cache_init() - Initialize tt memory object cache * * Return: 0 on success or negative error number in case of failure. */ int __init batadv_tt_cache_init(void) { size_t tl_size = sizeof(struct batadv_tt_local_entry); size_t tg_size = sizeof(struct batadv_tt_global_entry); size_t tt_orig_size = sizeof(struct batadv_tt_orig_list_entry); size_t tt_change_size = sizeof(struct batadv_tt_change_node); size_t tt_req_size = sizeof(struct batadv_tt_req_node); size_t tt_roam_size = sizeof(struct batadv_tt_roam_node); batadv_tl_cache = kmem_cache_create("batadv_tl_cache", tl_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tl_cache) return -ENOMEM; batadv_tg_cache = kmem_cache_create("batadv_tg_cache", tg_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tg_cache) goto err_tt_tl_destroy; batadv_tt_orig_cache = kmem_cache_create("batadv_tt_orig_cache", tt_orig_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_orig_cache) goto err_tt_tg_destroy; batadv_tt_change_cache = kmem_cache_create("batadv_tt_change_cache", tt_change_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_change_cache) goto err_tt_orig_destroy; batadv_tt_req_cache = kmem_cache_create("batadv_tt_req_cache", tt_req_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_req_cache) goto err_tt_change_destroy; batadv_tt_roam_cache = kmem_cache_create("batadv_tt_roam_cache", tt_roam_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_roam_cache) goto err_tt_req_destroy; return 0; err_tt_req_destroy: kmem_cache_destroy(batadv_tt_req_cache); batadv_tt_req_cache = NULL; err_tt_change_destroy: kmem_cache_destroy(batadv_tt_change_cache); batadv_tt_change_cache = NULL; err_tt_orig_destroy: kmem_cache_destroy(batadv_tt_orig_cache); batadv_tt_orig_cache = NULL; err_tt_tg_destroy: kmem_cache_destroy(batadv_tg_cache); batadv_tg_cache = NULL; err_tt_tl_destroy: kmem_cache_destroy(batadv_tl_cache); batadv_tl_cache = NULL; return -ENOMEM; } /** * batadv_tt_cache_destroy() - Destroy tt memory object cache */ void batadv_tt_cache_destroy(void) { kmem_cache_destroy(batadv_tl_cache); kmem_cache_destroy(batadv_tg_cache); kmem_cache_destroy(batadv_tt_orig_cache); kmem_cache_destroy(batadv_tt_change_cache); kmem_cache_destroy(batadv_tt_req_cache); kmem_cache_destroy(batadv_tt_roam_cache); }
564 1 1 564 150 416 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for logging packets to userspace via * nfetlink. * * (C) 2005 by Harald Welte <laforge@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ipt_ULOG.c: * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/netfilter_bridge.h> #include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_log.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/spinlock.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/list.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netfilter/nf_log.h> #include <net/netns/generic.h> #include <linux/atomic.h> #include <linux/refcount.h> #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #define NFULNL_COPY_DISABLED 0xff #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ /* max packet size is limited by 16-bit struct nfattr nfa_len field */ #define NFULNL_COPY_RANGE_MAX (0xFFFF - NLA_HDRLEN) #define PRINTR(x, args...) do { if (net_ratelimit()) \ printk(x, ## args); } while (0); struct nfulnl_instance { struct hlist_node hlist; /* global list of instances */ spinlock_t lock; refcount_t use; /* use count */ unsigned int qlen; /* number of nlmsgs in skb */ struct sk_buff *skb; /* pre-allocatd skb */ struct timer_list timer; struct net *net; struct user_namespace *peer_user_ns; /* User namespace of the peer process */ u32 peer_portid; /* PORTID of the peer process */ /* configurable parameters */ unsigned int flushtimeout; /* timeout until queue flush */ unsigned int nlbufsiz; /* netlink buffer allocation size */ unsigned int qthreshold; /* threshold of the queue */ u_int32_t copy_range; u_int32_t seq; /* instance-local sequential counter */ u_int16_t group_num; /* number of this queue */ u_int16_t flags; u_int8_t copy_mode; struct rcu_head rcu; }; #define INSTANCE_BUCKETS 16 static unsigned int nfnl_log_net_id __read_mostly; struct nfnl_log_net { spinlock_t instances_lock; struct hlist_head instance_table[INSTANCE_BUCKETS]; atomic_t global_seq; }; static struct nfnl_log_net *nfnl_log_pernet(struct net *net) { return net_generic(net, nfnl_log_net_id); } static inline u_int8_t instance_hashfn(u_int16_t group_num) { return ((group_num & 0xff) % INSTANCE_BUCKETS); } static struct nfulnl_instance * __instance_lookup(struct nfnl_log_net *log, u_int16_t group_num) { struct hlist_head *head; struct nfulnl_instance *inst; head = &log->instance_table[instance_hashfn(group_num)]; hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->group_num == group_num) return inst; } return NULL; } static inline void instance_get(struct nfulnl_instance *inst) { refcount_inc(&inst->use); } static struct nfulnl_instance * instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num) { struct nfulnl_instance *inst; rcu_read_lock_bh(); inst = __instance_lookup(log, group_num); if (inst && !refcount_inc_not_zero(&inst->use)) inst = NULL; rcu_read_unlock_bh(); return inst; } static void nfulnl_instance_free_rcu(struct rcu_head *head) { struct nfulnl_instance *inst = container_of(head, struct nfulnl_instance, rcu); put_net(inst->net); kfree(inst); module_put(THIS_MODULE); } static void instance_put(struct nfulnl_instance *inst) { if (inst && refcount_dec_and_test(&inst->use)) call_rcu(&inst->rcu, nfulnl_instance_free_rcu); } static void nfulnl_timer(struct timer_list *t); static struct nfulnl_instance * instance_create(struct net *net, u_int16_t group_num, u32 portid, struct user_namespace *user_ns) { struct nfulnl_instance *inst; struct nfnl_log_net *log = nfnl_log_pernet(net); int err; spin_lock_bh(&log->instances_lock); if (__instance_lookup(log, group_num)) { err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) { err = -ENOMEM; goto out_unlock; } if (!try_module_get(THIS_MODULE)) { kfree(inst); err = -EAGAIN; goto out_unlock; } INIT_HLIST_NODE(&inst->hlist); spin_lock_init(&inst->lock); /* needs to be two, since we _put() after creation */ refcount_set(&inst->use, 2); timer_setup(&inst->timer, nfulnl_timer, 0); inst->net = get_net(net); inst->peer_user_ns = user_ns; inst->peer_portid = portid; inst->group_num = group_num; inst->qthreshold = NFULNL_QTHRESH_DEFAULT; inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT; inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT; inst->copy_mode = NFULNL_COPY_PACKET; inst->copy_range = NFULNL_COPY_RANGE_MAX; hlist_add_head_rcu(&inst->hlist, &log->instance_table[instance_hashfn(group_num)]); spin_unlock_bh(&log->instances_lock); return inst; out_unlock: spin_unlock_bh(&log->instances_lock); return ERR_PTR(err); } static void __nfulnl_flush(struct nfulnl_instance *inst); /* called with BH disabled */ static void __instance_destroy(struct nfulnl_instance *inst) { /* first pull it out of the global list */ hlist_del_rcu(&inst->hlist); /* then flush all pending packets from skb */ spin_lock(&inst->lock); /* lockless readers wont be able to use us */ inst->copy_mode = NFULNL_COPY_DISABLED; if (inst->skb) __nfulnl_flush(inst); spin_unlock(&inst->lock); /* and finally put the refcount */ instance_put(inst); } static inline void instance_destroy(struct nfnl_log_net *log, struct nfulnl_instance *inst) { spin_lock_bh(&log->instances_lock); __instance_destroy(inst); spin_unlock_bh(&log->instances_lock); } static int nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, unsigned int range) { int status = 0; spin_lock_bh(&inst->lock); switch (mode) { case NFULNL_COPY_NONE: case NFULNL_COPY_META: inst->copy_mode = mode; inst->copy_range = 0; break; case NFULNL_COPY_PACKET: inst->copy_mode = mode; if (range == 0) range = NFULNL_COPY_RANGE_MAX; inst->copy_range = min_t(unsigned int, range, NFULNL_COPY_RANGE_MAX); break; default: status = -EINVAL; break; } spin_unlock_bh(&inst->lock); return status; } static int nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) { int status; spin_lock_bh(&inst->lock); if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT) status = -ERANGE; else if (nlbufsiz > 131072) status = -ERANGE; else { inst->nlbufsiz = nlbufsiz; status = 0; } spin_unlock_bh(&inst->lock); return status; } static void nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout) { spin_lock_bh(&inst->lock); inst->flushtimeout = timeout; spin_unlock_bh(&inst->lock); } static void nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh) { spin_lock_bh(&inst->lock); inst->qthreshold = qthresh; spin_unlock_bh(&inst->lock); } static int nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) { spin_lock_bh(&inst->lock); inst->flags = flags; spin_unlock_bh(&inst->lock); return 0; } static struct sk_buff * nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, unsigned int pkt_size) { struct sk_buff *skb; unsigned int n; /* alloc skb which should be big enough for a whole multipart * message. WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ skb = alloc_skb(pkt_size, GFP_ATOMIC); } } return skb; } static void __nfulnl_send(struct nfulnl_instance *inst) { if (inst->qlen > 1) { struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0, NLMSG_DONE, sizeof(struct nfgenmsg), 0); if (WARN_ONCE(!nlh, "bad nlskb size: %u, tailroom %d\n", inst->skb->len, skb_tailroom(inst->skb))) { kfree_skb(inst->skb); goto out; } } nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid); out: inst->qlen = 0; inst->skb = NULL; } static void __nfulnl_flush(struct nfulnl_instance *inst) { /* timer holds a reference */ if (del_timer(&inst->timer)) instance_put(inst); if (inst->skb) __nfulnl_send(inst); } static void nfulnl_timer(struct timer_list *t) { struct nfulnl_instance *inst = from_timer(inst, t, timer); spin_lock_bh(&inst->lock); if (inst->skb) __nfulnl_send(inst); spin_unlock_bh(&inst->lock); instance_put(inst); } static u32 nfulnl_get_bridge_size(const struct sk_buff *skb) { u32 size = 0; if (!skb_mac_header_was_set(skb)) return 0; if (skb_vlan_tag_present(skb)) { size += nla_total_size(0); /* nested */ size += nla_total_size(sizeof(u16)); /* id */ size += nla_total_size(sizeof(u16)); /* tag */ } if (skb->network_header > skb->mac_header) size += nla_total_size(skb->network_header - skb->mac_header); return size; } static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) return 0; if (skb_vlan_tag_present(skb)) { struct nlattr *nest; nest = nla_nest_start(inst->skb, NFULA_VLAN); if (!nest) goto nla_put_failure; if (nla_put_be16(inst->skb, NFULA_VLAN_TCI, htons(skb->vlan_tci)) || nla_put_be16(inst->skb, NFULA_VLAN_PROTO, skb->vlan_proto)) goto nla_put_failure; nla_nest_end(inst->skb, nest); } if (skb->mac_header < skb->network_header) { int len = (int)(skb->network_header - skb->mac_header); if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } /* This is an inline function, we don't really care about a long * list of arguments */ static inline int __build_packet_message(struct nfnl_log_net *log, struct nfulnl_instance *inst, const struct sk_buff *skb, unsigned int data_len, u_int8_t pf, unsigned int hooknum, const struct net_device *indev, const struct net_device *outdev, const char *prefix, unsigned int plen, const struct nfnl_ct_hook *nfnl_ct, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; nlh = nfnl_msg_put(inst->skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), 0, pf, NFNETLINK_V0, htons(inst->group_num)); if (!nlh) return -1; memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; pmsg.hook = hooknum; if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg)) goto nla_put_failure; if (prefix && nla_put(inst->skb, NFULA_PREFIX, plen, prefix)) goto nla_put_failure; if (indev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by nf_hook_thresh or * nf_log_packet. */ nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { struct net_device *physindev; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; physindev = nf_bridge_get_physindev(skb); if (physindev && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, htonl(physindev->ifindex))) goto nla_put_failure; } #endif } if (outdev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by nf_hook_thresh or * nf_log_packet. */ nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; } else { struct net_device *physoutdev; /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; physoutdev = nf_bridge_get_physoutdev(skb); if (physoutdev && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, htonl(physoutdev->ifindex))) goto nla_put_failure; } #endif } if (skb->mark && nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark))) goto nla_put_failure; if (indev && skb->dev && skb_mac_header_was_set(skb) && skb_mac_header_len(skb) != 0) { struct nfulnl_msg_packet_hw phw; int len; memset(&phw, 0, sizeof(phw)); len = dev_parse_header(skb, phw.hw_addr); if (len > 0) { phw.hw_addrlen = htons(len); if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw)) goto nla_put_failure; } } if (indev && skb_mac_header_was_set(skb)) { if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || nla_put_be16(inst->skb, NFULA_HWLEN, htons(skb->dev->hard_header_len))) goto nla_put_failure; hwhdrp = skb_mac_header(skb); if (skb->dev->type == ARPHRD_SIT) hwhdrp -= ETH_HLEN; if (hwhdrp >= skb->head && nla_put(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, hwhdrp)) goto nla_put_failure; } if (hooknum <= NF_INET_FORWARD && skb->tstamp) { struct nfulnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(skb->tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; } /* UID */ sk = skb->sk; if (sk && sk_fullsock(sk)) { read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { struct file *file = sk->sk_socket->file; const struct cred *cred = file->f_cred; struct user_namespace *user_ns = inst->peer_user_ns; __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid)); __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid)); read_unlock_bh(&sk->sk_callback_lock); if (nla_put_be32(inst->skb, NFULA_UID, uid) || nla_put_be32(inst->skb, NFULA_GID, gid)) goto nla_put_failure; } else read_unlock_bh(&sk->sk_callback_lock); } /* local sequence number */ if ((inst->flags & NFULNL_CFG_F_SEQ) && nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++))) goto nla_put_failure; /* global sequence number */ if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) && nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL, htonl(atomic_inc_return(&log->global_seq)))) goto nla_put_failure; if (ct && nfnl_ct->build(inst->skb, ct, ctinfo, NFULA_CT, NFULA_CT_INFO) < 0) goto nla_put_failure; if ((pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) && nfulnl_put_bridge(inst, skb) < 0) goto nla_put_failure; if (data_len) { struct nlattr *nla; int size = nla_attr_size(data_len); if (skb_tailroom(inst->skb) < nla_total_size(data_len)) goto nla_put_failure; nla = skb_put(inst->skb, nla_total_size(data_len)); nla->nla_type = NFULA_PAYLOAD; nla->nla_len = size; if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) BUG(); } nlh->nlmsg_len = inst->skb->tail - old_tail; return 0; nla_put_failure: PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); return -1; } static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_ULOG, .u = { .ulog = { .copy_len = 0xffff, .group = 0, .qthreshold = 1, }, }, }; /* log handler for internal netfilter logging api */ static void nfulnl_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *li_user, const char *prefix) { size_t size; unsigned int data_len; struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; struct nf_conn *ct = NULL; enum ip_conntrack_info ctinfo; if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; else li = &default_loginfo; inst = instance_lookup_get(log, li->u.ulog.group); if (!inst) return; if (prefix) plen = strlen(prefix) + 1; /* FIXME: do we want to make the size calculation conditional based on * what is actually present? way more branches and checks, but more * memory efficient... */ size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(u_int32_t)) /* uid */ + nla_total_size(sizeof(u_int32_t)) /* gid */ + nla_total_size(plen) /* prefix */ + nla_total_size(sizeof(struct nfulnl_msg_packet_hw)) + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp)) + nla_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */ if (in && skb_mac_header_was_set(skb)) { size += nla_total_size(skb->dev->hard_header_len) + nla_total_size(sizeof(u_int16_t)) /* hwtype */ + nla_total_size(sizeof(u_int16_t)); /* hwlen */ } spin_lock_bh(&inst->lock); if (inst->flags & NFULNL_CFG_F_SEQ) size += nla_total_size(sizeof(u_int32_t)); if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) size += nla_total_size(sizeof(u_int32_t)); #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (inst->flags & NFULNL_CFG_F_CONNTRACK) { nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfnl_ct != NULL) { ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } #endif if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) size += nfulnl_get_bridge_size(skb); qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ if (li->u.ulog.qthreshold) if (qthreshold > li->u.ulog.qthreshold) qthreshold = li->u.ulog.qthreshold; switch (inst->copy_mode) { case NFULNL_COPY_META: case NFULNL_COPY_NONE: data_len = 0; break; case NFULNL_COPY_PACKET: data_len = inst->copy_range; if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) && (li->u.ulog.copy_len < data_len)) data_len = li->u.ulog.copy_len; if (data_len > skb->len) data_len = skb->len; size += nla_total_size(data_len); break; case NFULNL_COPY_DISABLED: default: goto unlock_and_release; } if (inst->skb && size > skb_tailroom(inst->skb)) { /* either the queue len is too high or we don't have * enough room in the skb left. flush to userspace. */ __nfulnl_flush(inst); } if (!inst->skb) { inst->skb = nfulnl_alloc_skb(net, inst->peer_portid, inst->nlbufsiz, size); if (!inst->skb) goto alloc_failure; } inst->qlen++; __build_packet_message(log, inst, skb, data_len, pf, hooknum, in, out, prefix, plen, nfnl_ct, ct, ctinfo); if (inst->qlen >= qthreshold) __nfulnl_flush(inst); /* timer_pending always called within inst->lock, so there * is no chance of a race here */ else if (!timer_pending(&inst->timer)) { instance_get(inst); inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100); add_timer(&inst->timer); } unlock_and_release: spin_unlock_bh(&inst->lock); instance_put(inst); return; alloc_failure: /* FIXME: statistics */ goto unlock_and_release; } static int nfulnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct nfnl_log_net *log = nfnl_log_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ spin_lock_bh(&log->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *t2; struct nfulnl_instance *inst; struct hlist_head *head = &log->instance_table[i]; hlist_for_each_entry_safe(inst, t2, head, hlist) { if (n->portid == inst->peer_portid) __instance_destroy(inst); } } spin_unlock_bh(&log->instances_lock); } return NOTIFY_DONE; } static struct notifier_block nfulnl_rtnl_notifier = { .notifier_call = nfulnl_rcv_nl_event, }; static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfula[]) { return -ENOTSUPP; } static struct nf_logger nfulnl_logger __read_mostly = { .name = "nfnetlink_log", .type = NF_LOG_TYPE_ULOG, .logfn = nfulnl_log_packet, .me = THIS_MODULE, }; static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { [NFULA_CFG_CMD] = { .len = sizeof(struct nfulnl_msg_config_cmd) }, [NFULA_CFG_MODE] = { .len = sizeof(struct nfulnl_msg_config_mode) }, [NFULA_CFG_TIMEOUT] = { .type = NLA_U32 }, [NFULA_CFG_QTHRESH] = { .type = NLA_U32 }, [NFULA_CFG_NLBUFSIZ] = { .type = NLA_U32 }, [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, }; static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfula[]) { struct nfnl_log_net *log = nfnl_log_pernet(info->net); u_int16_t group_num = ntohs(info->nfmsg->res_id); struct nfulnl_msg_config_cmd *cmd = NULL; struct nfulnl_instance *inst; u16 flags = 0; int ret = 0; if (nfula[NFULA_CFG_CMD]) { u_int8_t pf = info->nfmsg->nfgen_family; cmd = nla_data(nfula[NFULA_CFG_CMD]); /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: return nf_log_bind_pf(info->net, pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: nf_log_unbind_pf(info->net, pf); return 0; } } inst = instance_lookup_get(log, group_num); if (inst && inst->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto out_put; } /* Check if we support these flags in first place, dependencies should * be there too not to break atomicity. */ if (nfula[NFULA_CFG_FLAGS]) { flags = ntohs(nla_get_be16(nfula[NFULA_CFG_FLAGS])); if ((flags & NFULNL_CFG_F_CONNTRACK) && !rcu_access_pointer(nfnl_ct_hook)) { #ifdef CONFIG_MODULES nfnl_unlock(NFNL_SUBSYS_ULOG); request_module("ip_conntrack_netlink"); nfnl_lock(NFNL_SUBSYS_ULOG); if (rcu_access_pointer(nfnl_ct_hook)) { ret = -EAGAIN; goto out_put; } #endif ret = -EOPNOTSUPP; goto out_put; } } if (cmd != NULL) { switch (cmd->command) { case NFULNL_CFG_CMD_BIND: if (inst) { ret = -EBUSY; goto out_put; } inst = instance_create(info->net, group_num, NETLINK_CB(skb).portid, sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { ret = PTR_ERR(inst); goto out; } break; case NFULNL_CFG_CMD_UNBIND: if (!inst) { ret = -ENODEV; goto out; } instance_destroy(log, inst); goto out_put; default: ret = -ENOTSUPP; goto out_put; } } else if (!inst) { ret = -ENODEV; goto out; } if (nfula[NFULA_CFG_MODE]) { struct nfulnl_msg_config_mode *params = nla_data(nfula[NFULA_CFG_MODE]); nfulnl_set_mode(inst, params->copy_mode, ntohl(params->copy_range)); } if (nfula[NFULA_CFG_TIMEOUT]) { __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]); nfulnl_set_timeout(inst, ntohl(timeout)); } if (nfula[NFULA_CFG_NLBUFSIZ]) { __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]); nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); } if (nfula[NFULA_CFG_QTHRESH]) { __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]); nfulnl_set_qthresh(inst, ntohl(qthresh)); } if (nfula[NFULA_CFG_FLAGS]) nfulnl_set_flags(inst, flags); out_put: instance_put(inst); out: return ret; } static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, .type = NFNL_CB_MUTEX, .attr_count = NFULA_MAX, }, [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, .type = NFNL_CB_MUTEX, .attr_count = NFULA_CFG_MAX, .policy = nfula_cfg_policy }, }; static const struct nfnetlink_subsystem nfulnl_subsys = { .name = "log", .subsys_id = NFNL_SUBSYS_ULOG, .cb_count = NFULNL_MSG_MAX, .cb = nfulnl_cb, }; #ifdef CONFIG_PROC_FS struct iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct net *net, struct iter_state *st) { struct nfnl_log_net *log; if (!st) return NULL; log = nfnl_log_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { struct hlist_head *head = &log->instance_table[st->bucket]; if (!hlist_empty(head)) return rcu_dereference_bh(hlist_first_rcu(head)); } return NULL; } static struct hlist_node *get_next(struct net *net, struct iter_state *st, struct hlist_node *h) { h = rcu_dereference_bh(hlist_next_rcu(h)); while (!h) { struct nfnl_log_net *log; struct hlist_head *head; if (++st->bucket >= INSTANCE_BUCKETS) return NULL; log = nfnl_log_pernet(net); head = &log->instance_table[st->bucket]; h = rcu_dereference_bh(hlist_first_rcu(head)); } return h; } static struct hlist_node *get_idx(struct net *net, struct iter_state *st, loff_t pos) { struct hlist_node *head; head = get_first(net, st); if (head) while (pos && (head = get_next(net, st, head))) pos--; return pos ? NULL : head; } static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(rcu_bh) { rcu_read_lock_bh(); return get_idx(seq_file_net(s), s->private, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return get_next(seq_file_net(s), s->private, v); } static void seq_stop(struct seq_file *s, void *v) __releases(rcu_bh) { rcu_read_unlock_bh(); } static int seq_show(struct seq_file *s, void *v) { const struct nfulnl_instance *inst = v; seq_printf(s, "%5u %6u %5u %1u %5u %6u %2u\n", inst->group_num, inst->peer_portid, inst->qlen, inst->copy_mode, inst->copy_range, inst->flushtimeout, refcount_read(&inst->use)); return 0; } static const struct seq_operations nful_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, .show = seq_show, }; #endif /* PROC_FS */ static int __net_init nfnl_log_net_init(struct net *net) { unsigned int i; struct nfnl_log_net *log = nfnl_log_pernet(net); #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; #endif for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&log->instance_table[i]); spin_lock_init(&log->instances_lock); #ifdef CONFIG_PROC_FS proc = proc_create_net("nfnetlink_log", 0440, net->nf.proc_netfilter, &nful_seq_ops, sizeof(struct iter_state)); if (!proc) return -ENOMEM; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif return 0; } static void __net_exit nfnl_log_net_exit(struct net *net) { struct nfnl_log_net *log = nfnl_log_pernet(net); unsigned int i; #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); #endif nf_log_unset(net, &nfulnl_logger); for (i = 0; i < INSTANCE_BUCKETS; i++) WARN_ON_ONCE(!hlist_empty(&log->instance_table[i])); } static struct pernet_operations nfnl_log_net_ops = { .init = nfnl_log_net_init, .exit = nfnl_log_net_exit, .id = &nfnl_log_net_id, .size = sizeof(struct nfnl_log_net), }; static int __init nfnetlink_log_init(void) { int status; status = register_pernet_subsys(&nfnl_log_net_ops); if (status < 0) { pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); if (status < 0) { pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); if (status < 0) { pr_err("failed to register logger\n"); goto cleanup_subsys; } return status; cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfulnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_log_net_ops); out: return status; } static void __exit nfnetlink_log_fini(void) { nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); } MODULE_DESCRIPTION("netfilter userspace logging"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); MODULE_ALIAS_NF_LOGGER(AF_INET, 1); MODULE_ALIAS_NF_LOGGER(AF_INET6, 1); MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1); MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */ MODULE_ALIAS_NF_LOGGER(5, 1); /* NFPROTO_NETDEV */ module_init(nfnetlink_log_init); module_exit(nfnetlink_log_fini);
8 8 8 32 8 8 8 32 31 24 24 24 12 22 29 12 62 37 37 33 33 33 41 2 5 25 21 25 41 24 24 24 12 12 12 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/switchdev/switchdev.c - Switch device API * Copyright (c) 2014-2015 Jiri Pirko <jiri@resnulli.us> * Copyright (c) 2014-2015 Scott Feldman <sfeldma@gmail.com> */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_bridge.h> #include <linux/list.h> #include <linux/workqueue.h> #include <linux/if_vlan.h> #include <linux/rtnetlink.h> #include <net/switchdev.h> static LIST_HEAD(deferred); static DEFINE_SPINLOCK(deferred_lock); typedef void switchdev_deferred_func_t(struct net_device *dev, const void *data); struct switchdev_deferred_item { struct list_head list; struct net_device *dev; switchdev_deferred_func_t *func; unsigned long data[]; }; static struct switchdev_deferred_item *switchdev_deferred_dequeue(void) { struct switchdev_deferred_item *dfitem; spin_lock_bh(&deferred_lock); if (list_empty(&deferred)) { dfitem = NULL; goto unlock; } dfitem = list_first_entry(&deferred, struct switchdev_deferred_item, list); list_del(&dfitem->list); unlock: spin_unlock_bh(&deferred_lock); return dfitem; } /** * switchdev_deferred_process - Process ops in deferred queue * * Called to flush the ops currently queued in deferred ops queue. * rtnl_lock must be held. */ void switchdev_deferred_process(void) { struct switchdev_deferred_item *dfitem; ASSERT_RTNL(); while ((dfitem = switchdev_deferred_dequeue())) { dfitem->func(dfitem->dev, dfitem->data); dev_put(dfitem->dev); kfree(dfitem); } } EXPORT_SYMBOL_GPL(switchdev_deferred_process); static void switchdev_deferred_process_work(struct work_struct *work) { rtnl_lock(); switchdev_deferred_process(); rtnl_unlock(); } static DECLARE_WORK(deferred_process_work, switchdev_deferred_process_work); static int switchdev_deferred_enqueue(struct net_device *dev, const void *data, size_t data_len, switchdev_deferred_func_t *func) { struct switchdev_deferred_item *dfitem; dfitem = kmalloc(sizeof(*dfitem) + data_len, GFP_ATOMIC); if (!dfitem) return -ENOMEM; dfitem->dev = dev; dfitem->func = func; memcpy(dfitem->data, data, data_len); dev_hold(dev); spin_lock_bh(&deferred_lock); list_add_tail(&dfitem->list, &deferred); spin_unlock_bh(&deferred_lock); schedule_work(&deferred_process_work); return 0; } static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { int err; int rc; struct switchdev_notifier_port_attr_info attr_info = { .attr = attr, .handled = false, }; rc = call_switchdev_blocking_notifiers(nt, dev, &attr_info.info, extack); err = notifier_to_errno(rc); if (err) { WARN_ON(!attr_info.handled); return err; } if (!attr_info.handled) return -EOPNOTSUPP; return 0; } static int switchdev_port_attr_set_now(struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, extack); } static void switchdev_port_attr_set_deferred(struct net_device *dev, const void *data) { const struct switchdev_attr *attr = data; int err; err = switchdev_port_attr_set_now(dev, attr, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", err, attr->id); if (attr->complete) attr->complete(dev, err, attr->complete_priv); } static int switchdev_port_attr_set_defer(struct net_device *dev, const struct switchdev_attr *attr) { return switchdev_deferred_enqueue(dev, attr, sizeof(*attr), switchdev_port_attr_set_deferred); } /** * switchdev_port_attr_set - Set port attribute * * @dev: port device * @attr: attribute to set * @extack: netlink extended ack, for error message propagation * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { if (attr->flags & SWITCHDEV_F_DEFER) return switchdev_port_attr_set_defer(dev, attr); ASSERT_RTNL(); return switchdev_port_attr_set_now(dev, attr, extack); } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); static size_t switchdev_obj_size(const struct switchdev_obj *obj) { switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: return sizeof(struct switchdev_obj_port_vlan); case SWITCHDEV_OBJ_ID_PORT_MDB: return sizeof(struct switchdev_obj_port_mdb); case SWITCHDEV_OBJ_ID_HOST_MDB: return sizeof(struct switchdev_obj_port_mdb); default: BUG(); } return 0; } static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { int rc; int err; struct switchdev_notifier_port_obj_info obj_info = { .obj = obj, .handled = false, }; rc = call_switchdev_blocking_notifiers(nt, dev, &obj_info.info, extack); err = notifier_to_errno(rc); if (err) { WARN_ON(!obj_info.handled); return err; } if (!obj_info.handled) return -EOPNOTSUPP; return 0; } static void switchdev_port_obj_add_deferred(struct net_device *dev, const void *data) { const struct switchdev_obj *obj = data; int err; ASSERT_RTNL(); err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", err, obj->id); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_add_defer(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj), switchdev_port_obj_add_deferred); } /** * switchdev_port_obj_add - Add port object * * @dev: port device * @obj: object to add * @extack: netlink extended ack * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { if (obj->flags & SWITCHDEV_F_DEFER) return switchdev_port_obj_add_defer(dev, obj); ASSERT_RTNL(); return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, extack); } EXPORT_SYMBOL_GPL(switchdev_port_obj_add); static int switchdev_port_obj_del_now(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL, dev, obj, NULL); } static void switchdev_port_obj_del_deferred(struct net_device *dev, const void *data) { const struct switchdev_obj *obj = data; int err; err = switchdev_port_obj_del_now(dev, obj); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to del object (id=%d)\n", err, obj->id); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_del_defer(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj), switchdev_port_obj_del_deferred); } /** * switchdev_port_obj_del - Delete port object * * @dev: port device * @obj: object to delete * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj) { if (obj->flags & SWITCHDEV_F_DEFER) return switchdev_port_obj_del_defer(dev, obj); ASSERT_RTNL(); return switchdev_port_obj_del_now(dev, obj); } EXPORT_SYMBOL_GPL(switchdev_port_obj_del); static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain); /** * register_switchdev_notifier - Register notifier * @nb: notifier_block * * Register switch device notifier. */ int register_switchdev_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&switchdev_notif_chain, nb); } EXPORT_SYMBOL_GPL(register_switchdev_notifier); /** * unregister_switchdev_notifier - Unregister notifier * @nb: notifier_block * * Unregister switch device notifier. */ int unregister_switchdev_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&switchdev_notif_chain, nb); } EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); /** * call_switchdev_notifiers - Call notifiers * @val: value passed unmodified to notifier function * @dev: port device * @info: notifier information data * @extack: netlink extended ack * Call all network notifier blocks. */ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack) { info->dev = dev; info->extack = extack; return atomic_notifier_call_chain(&switchdev_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); int register_switchdev_blocking_notifier(struct notifier_block *nb) { struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; return blocking_notifier_chain_register(chain, nb); } EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier); int unregister_switchdev_blocking_notifier(struct notifier_block *nb) { struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; return blocking_notifier_chain_unregister(chain, nb); } EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier); int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack) { info->dev = dev; info->extack = extack; return blocking_notifier_call_chain(&switchdev_blocking_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); struct switchdev_nested_priv { bool (*check_cb)(const struct net_device *dev); bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev); const struct net_device *dev; struct net_device *lower_dev; }; static int switchdev_lower_dev_walk(struct net_device *lower_dev, struct netdev_nested_priv *priv) { struct switchdev_nested_priv *switchdev_priv = priv->data; bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev); bool (*check_cb)(const struct net_device *dev); const struct net_device *dev; check_cb = switchdev_priv->check_cb; foreign_dev_check_cb = switchdev_priv->foreign_dev_check_cb; dev = switchdev_priv->dev; if (check_cb(lower_dev) && !foreign_dev_check_cb(lower_dev, dev)) { switchdev_priv->lower_dev = lower_dev; return 1; } return 0; } static struct net_device * switchdev_lower_dev_find(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev)) { struct switchdev_nested_priv switchdev_priv = { .check_cb = check_cb, .foreign_dev_check_cb = foreign_dev_check_cb, .dev = dev, .lower_dev = NULL, }; struct netdev_nested_priv priv = { .data = &switchdev_priv, }; netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv); return switchdev_priv.lower_dev; } static int __switchdev_handle_fdb_add_to_device(struct net_device *dev, const struct net_device *orig_dev, const struct switchdev_notifier_fdb_info *fdb_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*add_cb)(struct net_device *dev, const struct net_device *orig_dev, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info), int (*lag_add_cb)(struct net_device *dev, const struct net_device *orig_dev, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info)) { const struct switchdev_notifier_info *info = &fdb_info->info; struct net_device *br, *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; if (check_cb(dev)) return add_cb(dev, orig_dev, info->ctx, fdb_info); if (netif_is_lag_master(dev)) { if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) goto maybe_bridged_with_us; /* This is a LAG interface that we offload */ if (!lag_add_cb) return -EOPNOTSUPP; return lag_add_cb(dev, orig_dev, info->ctx, fdb_info); } /* Recurse through lower interfaces in case the FDB entry is pointing * towards a bridge device. */ if (netif_is_bridge_master(dev)) { if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) return 0; /* This is a bridge interface that we offload */ netdev_for_each_lower_dev(dev, lower_dev, iter) { /* Do not propagate FDB entries across bridges */ if (netif_is_bridge_master(lower_dev)) continue; /* Bridge ports might be either us, or LAG interfaces * that we offload. */ if (!check_cb(lower_dev) && !switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb)) continue; err = __switchdev_handle_fdb_add_to_device(lower_dev, orig_dev, fdb_info, check_cb, foreign_dev_check_cb, add_cb, lag_add_cb); if (err && err != -EOPNOTSUPP) return err; } return 0; } maybe_bridged_with_us: /* Event is neither on a bridge nor a LAG. Check whether it is on an * interface that is in a bridge with us. */ br = netdev_master_upper_dev_get_rcu(dev); if (!br || !netif_is_bridge_master(br)) return 0; if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb)) return 0; return __switchdev_handle_fdb_add_to_device(br, orig_dev, fdb_info, check_cb, foreign_dev_check_cb, add_cb, lag_add_cb); } int switchdev_handle_fdb_add_to_device(struct net_device *dev, const struct switchdev_notifier_fdb_info *fdb_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*add_cb)(struct net_device *dev, const struct net_device *orig_dev, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info), int (*lag_add_cb)(struct net_device *dev, const struct net_device *orig_dev, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info)) { int err; err = __switchdev_handle_fdb_add_to_device(dev, dev, fdb_info, check_cb, foreign_dev_check_cb, add_cb, lag_add_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_fdb_add_to_device); static int __switchdev_handle_fdb_del_to_device(struct net_device *dev, const struct net_device *orig_dev, const struct switchdev_notifier_fdb_info *fdb_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*del_cb)(struct net_device *dev, const struct net_device *orig_dev, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info), int (*lag_del_cb)(struct net_device *dev, const struct net_device *orig_dev, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info)) { const struct switchdev_notifier_info *info = &fdb_info->info; struct net_device *br, *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; if (check_cb(dev)) return del_cb(dev, orig_dev, info->ctx, fdb_info); if (netif_is_lag_master(dev)) { if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) goto maybe_bridged_with_us; /* This is a LAG interface that we offload */ if (!lag_del_cb) return -EOPNOTSUPP; return lag_del_cb(dev, orig_dev, info->ctx, fdb_info); } /* Recurse through lower interfaces in case the FDB entry is pointing * towards a bridge device. */ if (netif_is_bridge_master(dev)) { if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) return 0; /* This is a bridge interface that we offload */ netdev_for_each_lower_dev(dev, lower_dev, iter) { /* Do not propagate FDB entries across bridges */ if (netif_is_bridge_master(lower_dev)) continue; /* Bridge ports might be either us, or LAG interfaces * that we offload. */ if (!check_cb(lower_dev) && !switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb)) continue; err = __switchdev_handle_fdb_del_to_device(lower_dev, orig_dev, fdb_info, check_cb, foreign_dev_check_cb, del_cb, lag_del_cb); if (err && err != -EOPNOTSUPP) return err; } return 0; } maybe_bridged_with_us: /* Event is neither on a bridge nor a LAG. Check whether it is on an * interface that is in a bridge with us. */ br = netdev_master_upper_dev_get_rcu(dev); if (!br || !netif_is_bridge_master(br)) return 0; if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb)) return 0; return __switchdev_handle_fdb_del_to_device(br, orig_dev, fdb_info, check_cb, foreign_dev_check_cb, del_cb, lag_del_cb); } int switchdev_handle_fdb_del_to_device(struct net_device *dev, const struct switchdev_notifier_fdb_info *fdb_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*del_cb)(struct net_device *dev, const struct net_device *orig_dev, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info), int (*lag_del_cb)(struct net_device *dev, const struct net_device *orig_dev, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info)) { int err; err = __switchdev_handle_fdb_del_to_device(dev, dev, fdb_info, check_cb, foreign_dev_check_cb, del_cb, lag_del_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_fdb_del_to_device); static int __switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { struct switchdev_notifier_info *info = &port_obj_info->info; struct netlink_ext_ack *extack; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; extack = switchdev_notifier_info_to_extack(info); if (check_cb(dev)) { err = add_cb(dev, info->ctx, port_obj_info->obj, extack); if (err != -EOPNOTSUPP) port_obj_info->handled = true; return err; } /* Switch ports might be stacked under e.g. a LAG. Ignore the * unsupported devices, another driver might be able to handle them. But * propagate to the callers any hard errors. * * If the driver does its own bookkeeping of stacked ports, it's not * necessary to go through this helper. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { if (netif_is_bridge_master(lower_dev)) continue; err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info, check_cb, add_cb); if (err && err != -EOPNOTSUPP) return err; } return err; } int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { int err; err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb, add_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add); static int __switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { struct switchdev_notifier_info *info = &port_obj_info->info; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; if (check_cb(dev)) { err = del_cb(dev, info->ctx, port_obj_info->obj); if (err != -EOPNOTSUPP) port_obj_info->handled = true; return err; } /* Switch ports might be stacked under e.g. a LAG. Ignore the * unsupported devices, another driver might be able to handle them. But * propagate to the callers any hard errors. * * If the driver does its own bookkeeping of stacked ports, it's not * necessary to go through this helper. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { if (netif_is_bridge_master(lower_dev)) continue; err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info, check_cb, del_cb); if (err && err != -EOPNOTSUPP) return err; } return err; } int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { int err; err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb, del_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del); static int __switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)) { struct switchdev_notifier_info *info = &port_attr_info->info; struct netlink_ext_ack *extack; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; extack = switchdev_notifier_info_to_extack(info); if (check_cb(dev)) { err = set_cb(dev, info->ctx, port_attr_info->attr, extack); if (err != -EOPNOTSUPP) port_attr_info->handled = true; return err; } /* Switch ports might be stacked under e.g. a LAG. Ignore the * unsupported devices, another driver might be able to handle them. But * propagate to the callers any hard errors. * * If the driver does its own bookkeeping of stacked ports, it's not * necessary to go through this helper. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { if (netif_is_bridge_master(lower_dev)) continue; err = __switchdev_handle_port_attr_set(lower_dev, port_attr_info, check_cb, set_cb); if (err && err != -EOPNOTSUPP) return err; } return err; } int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)) { int err; err = __switchdev_handle_port_attr_set(dev, port_attr_info, check_cb, set_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_attr_set); int switchdev_bridge_port_offload(struct net_device *brport_dev, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, bool tx_fwd_offload, struct netlink_ext_ack *extack) { struct switchdev_notifier_brport_info brport_info = { .brport = { .dev = dev, .ctx = ctx, .atomic_nb = atomic_nb, .blocking_nb = blocking_nb, .tx_fwd_offload = tx_fwd_offload, }, }; int err; ASSERT_RTNL(); err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_OFFLOADED, brport_dev, &brport_info.info, extack); return notifier_to_errno(err); } EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload); void switchdev_bridge_port_unoffload(struct net_device *brport_dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb) { struct switchdev_notifier_brport_info brport_info = { .brport = { .ctx = ctx, .atomic_nb = atomic_nb, .blocking_nb = blocking_nb, }, }; ASSERT_RTNL(); call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_UNOFFLOADED, brport_dev, &brport_info.info, NULL); } EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the ICMP protocol. * * Version: @(#)icmp.h 1.0.3 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_ICMP_H #define _LINUX_ICMP_H #include <linux/skbuff.h> #include <uapi/linux/icmp.h> #include <uapi/linux/errqueue.h> static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { return (struct icmphdr *)skb_transport_header(skb); } static inline bool icmp_is_err(int type) { switch (type) { case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_REDIRECT: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: return true; } return false; } void ip_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out, int thlen, int off); #endif /* _LINUX_ICMP_H */
564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // SPDX-License-Identifier: GPL-2.0-only /* Accounting handling for netfilter. */ /* * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> static bool nf_ct_acct __read_mostly; module_param_named(acct, nf_ct_acct, bool, 0644); MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); static const struct nf_ct_ext_type acct_extend = { .len = sizeof(struct nf_conn_acct), .align = __alignof__(struct nf_conn_acct), .id = NF_CT_EXT_ACCT, }; void nf_conntrack_acct_pernet_init(struct net *net) { net->ct.sysctl_acct = nf_ct_acct; } int nf_conntrack_acct_init(void) { int ret = nf_ct_extend_register(&acct_extend); if (ret < 0) pr_err("Unable to register extension\n"); return ret; } void nf_conntrack_acct_fini(void) { nf_ct_extend_unregister(&acct_extend); }
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/gfp.h> #include <net/sock.h> #include <linux/in.h> #include <linux/list.h> #include <linux/ratelimit.h> #include <linux/export.h> #include <linux/sizes.h> #include "rds.h" /* When transmitting messages in rds_send_xmit, we need to emerge from * time to time and briefly release the CPU. Otherwise the softlock watchdog * will kick our shin. * Also, it seems fairer to not let one busy connection stall all the * others. * * send_batch_count is the number of times we'll loop in send_xmit. Setting * it to 0 will restore the old behavior (where we looped until we had * drained the queue). */ static int send_batch_count = SZ_1K; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); static void rds_send_remove_from_sock(struct list_head *messages, int status); /* * Reset the send state. Callers must ensure that this doesn't race with * rds_send_xmit(). */ void rds_send_path_reset(struct rds_conn_path *cp) { struct rds_message *rm, *tmp; unsigned long flags; if (cp->cp_xmit_rm) { rm = cp->cp_xmit_rm; cp->cp_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ rds_message_unmapped(rm); rds_message_put(rm); } cp->cp_xmit_sg = 0; cp->cp_xmit_hdr_off = 0; cp->cp_xmit_data_off = 0; cp->cp_xmit_atomic_sent = 0; cp->cp_xmit_rdma_sent = 0; cp->cp_xmit_data_sent = 0; cp->cp_conn->c_map_queued = 0; cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; /* Mark messages as retransmissions, and move them to the send q */ spin_lock_irqsave(&cp->cp_lock, flags); list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); } list_splice_init(&cp->cp_retrans, &cp->cp_send_queue); spin_unlock_irqrestore(&cp->cp_lock, flags); } EXPORT_SYMBOL_GPL(rds_send_path_reset); static int acquire_in_xmit(struct rds_conn_path *cp) { return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0; } static void release_in_xmit(struct rds_conn_path *cp) { clear_bit(RDS_IN_XMIT, &cp->cp_flags); smp_mb__after_atomic(); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. We don't want to walk * the system-wide hashed waitqueue buckets in the fast path only to * almost never find waiters. */ if (waitqueue_active(&cp->cp_waitq)) wake_up_all(&cp->cp_waitq); } /* * We're making the conscious trade-off here to only send one message * down the connection at a time. * Pro: * - tx queueing is a simple fifo list * - reassembly is optional and easily done by transports per conn * - no per flow rx lookup at all, straight to the socket * - less per-frag memory and wire overhead * Con: * - queued acks can be delayed behind large messages * Depends: * - small message latency is higher behind queued large messages * - large message latency isn't starved by intervening small sends */ int rds_send_xmit(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; struct rds_message *rm; unsigned long flags; unsigned int tmp; struct scatterlist *sg; int ret = 0; LIST_HEAD(to_be_dropped); int batch_count; unsigned long send_gen = 0; int same_rm = 0; restart: batch_count = 0; /* * sendmsg calls here after having queued its message on the send * queue. We only have one task feeding the connection at a time. If * another thread is already feeding the queue then we back off. This * avoids blocking the caller and trading per-connection data between * caches per message. */ if (!acquire_in_xmit(cp)) { rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; } if (rds_destroy_pending(cp->cp_conn)) { release_in_xmit(cp); ret = -ENETUNREACH; /* dont requeue send work */ goto out; } /* * we record the send generation after doing the xmit acquire. * if someone else manages to jump in and do some work, we'll use * this to avoid a goto restart farther down. * * The acquire_in_xmit() check above ensures that only one * caller can increment c_send_gen at any time. */ send_gen = READ_ONCE(cp->cp_send_gen) + 1; WRITE_ONCE(cp->cp_send_gen, send_gen); /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, * we do the opposite to avoid races. */ if (!rds_conn_path_up(cp)) { release_in_xmit(cp); ret = 0; goto out; } if (conn->c_trans->xmit_path_prepare) conn->c_trans->xmit_path_prepare(cp); /* * spin trying to push headers and data down the connection until * the connection doesn't make forward progress. */ while (1) { rm = cp->cp_xmit_rm; if (!rm) { same_rm = 0; } else { same_rm++; if (same_rm >= 4096) { rds_stats_inc(s_send_stuck_rm); ret = -EAGAIN; break; } } /* * If between sending messages, we can send a pending congestion * map update. */ if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { rm = rds_cong_update_alloc(conn); if (IS_ERR(rm)) { ret = PTR_ERR(rm); break; } rm->data.op_active = 1; rm->m_inc.i_conn_path = cp; rm->m_inc.i_conn = cp->cp_conn; cp->cp_xmit_rm = rm; } /* * If not already working on one, grab the next message. * * cp_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. */ if (!rm) { unsigned int len; batch_count++; /* we want to process as big a batch as we can, but * we also want to avoid softlockups. If we've been * through a lot of messages, lets back off and see * if anyone else jumps in */ if (batch_count >= send_batch_count) goto over_batch; spin_lock_irqsave(&cp->cp_lock, flags); if (!list_empty(&cp->cp_send_queue)) { rm = list_entry(cp->cp_send_queue.next, struct rds_message, m_conn_item); rds_message_addref(rm); /* * Move the message from the send queue to the retransmit * list right away. */ list_move_tail(&rm->m_conn_item, &cp->cp_retrans); } spin_unlock_irqrestore(&cp->cp_lock, flags); if (!rm) break; /* Unfortunately, the way Infiniband deals with * RDMA to a bad MR key is by moving the entire * queue pair to error state. We cold possibly * recover from that, but right now we drop the * connection. * Therefore, we never retransmit messages with RDMA ops. */ if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) { spin_lock_irqsave(&cp->cp_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); spin_unlock_irqrestore(&cp->cp_lock, flags); continue; } /* Require an ACK every once in a while */ len = ntohl(rm->m_inc.i_hdr.h_len); if (cp->cp_unacked_packets == 0 || cp->cp_unacked_bytes < len) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; rds_stats_inc(s_send_ack_required); } else { cp->cp_unacked_bytes -= len; cp->cp_unacked_packets--; } cp->cp_xmit_rm = rm; } /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); wake_up_interruptible(&rm->m_flush_wait); break; } cp->cp_xmit_rdma_sent = 1; } if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); if (ret) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); wake_up_interruptible(&rm->m_flush_wait); break; } cp->cp_xmit_atomic_sent = 1; } /* * A number of cases require an RDS header to be sent * even if there is no data. * We permit 0-byte sends; rds-ping depends on this. * However, if there are exclusively attached silent ops, * we skip the hdr/data send, to enable silent operation. */ if (rm->data.op_nents == 0) { int ops_present; int all_ops_are_silent = 1; ops_present = (rm->atomic.op_active || rm->rdma.op_active); if (rm->atomic.op_active && !rm->atomic.op_silent) all_ops_are_silent = 0; if (rm->rdma.op_active && !rm->rdma.op_silent) all_ops_are_silent = 0; if (ops_present && all_ops_are_silent && !rm->m_rdma_cookie) rm->data.op_active = 0; } if (rm->data.op_active && !cp->cp_xmit_data_sent) { rm->m_final_op = &rm->data; ret = conn->c_trans->xmit(conn, rm, cp->cp_xmit_hdr_off, cp->cp_xmit_sg, cp->cp_xmit_data_off); if (ret <= 0) break; if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) { tmp = min_t(int, ret, sizeof(struct rds_header) - cp->cp_xmit_hdr_off); cp->cp_xmit_hdr_off += tmp; ret -= tmp; } sg = &rm->data.op_sg[cp->cp_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - cp->cp_xmit_data_off); cp->cp_xmit_data_off += tmp; ret -= tmp; if (cp->cp_xmit_data_off == sg->length) { cp->cp_xmit_data_off = 0; sg++; cp->cp_xmit_sg++; BUG_ON(ret != 0 && cp->cp_xmit_sg == rm->data.op_nents); } } if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) && (cp->cp_xmit_sg == rm->data.op_nents)) cp->cp_xmit_data_sent = 1; } /* * A rm will only take multiple times through this loop * if there is a data op. Thus, if the data is sent (or there was * none), then we're done with the rm. */ if (!rm->data.op_active || cp->cp_xmit_data_sent) { cp->cp_xmit_rm = NULL; cp->cp_xmit_sg = 0; cp->cp_xmit_hdr_off = 0; cp->cp_xmit_data_off = 0; cp->cp_xmit_rdma_sent = 0; cp->cp_xmit_atomic_sent = 0; cp->cp_xmit_data_sent = 0; rds_message_put(rm); } } over_batch: if (conn->c_trans->xmit_path_complete) conn->c_trans->xmit_path_complete(cp); release_in_xmit(cp); /* Nuke any messages we decided not to retransmit. */ if (!list_empty(&to_be_dropped)) { /* irqs on here, so we can put(), unlike above */ list_for_each_entry(rm, &to_be_dropped, m_conn_item) rds_message_put(rm); rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); } /* * Other senders can queue a message after we last test the send queue * but before we clear RDS_IN_XMIT. In that case they'd back off and * not try and send their newly queued message. We need to check the * send queue after having cleared RDS_IN_XMIT so that their message * doesn't get stuck on the send queue. * * If the transport cannot continue (i.e ret != 0), then it must * call us when more room is available, such as from the tx * completion handler. * * We have an extra generation check here so that if someone manages * to jump in after our release_in_xmit, we'll see that they have done * some work and we will skip our goto */ if (ret == 0) { bool raced; smp_mb(); raced = send_gen != READ_ONCE(cp->cp_send_gen); if ((test_bit(0, &conn->c_map_queued) || !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; rcu_read_lock(); if (rds_destroy_pending(cp->cp_conn)) ret = -ENETUNREACH; else queue_delayed_work(rds_wq, &cp->cp_send_w, 1); rcu_read_unlock(); } else if (raced) { rds_stats_inc(s_send_lock_queue_raced); } } out: return ret; } EXPORT_SYMBOL_GPL(rds_send_xmit); static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) { u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len); assert_spin_locked(&rs->rs_lock); BUG_ON(rs->rs_snd_bytes < len); rs->rs_snd_bytes -= len; if (rs->rs_snd_bytes == 0) rds_stats_inc(s_send_queue_empty); } static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, is_acked_func is_acked) { if (is_acked) return is_acked(rm, ack); return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; } /* * This is pretty similar to what happens below in the ACK * handling code - except that we call here as soon as we get * the IB send completion on the RDMA op and the accompanying * message. */ void rds_rdma_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; spin_lock_irqsave(&rm->m_rs_lock, flags); ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && ro->op_active && ro->op_notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; spin_lock(&rs->rs_lock); list_add_tail(&notifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); ro->op_notifier = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_rdma_send_complete); /* * Just like above, except looks at atomic op */ void rds_atomic_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; struct rm_atomic_op *ao; struct rds_notifier *notifier; unsigned long flags; spin_lock_irqsave(&rm->m_rs_lock, flags); ao = &rm->atomic; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && ao->op_active && ao->op_notify && ao->op_notifier) { notifier = ao->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; spin_lock(&rs->rs_lock); list_add_tail(&notifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); ao->op_notifier = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_atomic_send_complete); /* * This is the same as rds_rdma_send_complete except we * don't do any locking - we have all the ingredients (message, * socket, socket lock) and can just move the notifier. */ static inline void __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { struct rm_rdma_op *ro; struct rm_atomic_op *ao; ro = &rm->rdma; if (ro->op_active && ro->op_notify && ro->op_notifier) { ro->op_notifier->n_status = status; list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); ro->op_notifier = NULL; } ao = &rm->atomic; if (ao->op_active && ao->op_notify && ao->op_notifier) { ao->op_notifier->n_status = status; list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); ao->op_notifier = NULL; } /* No need to wake the app - caller does this */ } /* * This removes messages from the socket's list if they're on it. The list * argument must be private to the caller, we must be able to modify it * without locks. The messages must have a reference held for their * position on the list. This function will drop that reference after * removing the messages from the 'messages' list regardless of if it found * the messages on the socket list or not. */ static void rds_send_remove_from_sock(struct list_head *messages, int status) { unsigned long flags; struct rds_sock *rs = NULL; struct rds_message *rm; while (!list_empty(messages)) { int was_on_sock = 0; rm = list_entry(messages->next, struct rds_message, m_conn_item); list_del_init(&rm->m_conn_item); /* * If we see this flag cleared then we're *sure* that someone * else beat us to removing it from the sock. If we race * with their flag update we'll get the lock and then really * see that the flag has been cleared. * * The message spinlock makes sure nobody clears rm->m_rs * while we're messing with it. It does not prevent the * message from being removed from the socket, though. */ spin_lock_irqsave(&rm->m_rs_lock, flags); if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) goto unlock_and_drop; if (rs != rm->m_rs) { if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } rs = rm->m_rs; if (rs) sock_hold(rds_rs_to_sk(rs)); } if (!rs) goto unlock_and_drop; spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { struct rm_rdma_op *ro = &rm->rdma; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); if (ro->op_active && ro->op_notifier && (ro->op_notify || (ro->op_recverr && status))) { notifier = ro->op_notifier; list_add_tail(&notifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; rm->rdma.op_notifier = NULL; } was_on_sock = 1; } spin_unlock(&rs->rs_lock); unlock_and_drop: spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); if (was_on_sock) rds_message_put(rm); } if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } /* * Transports call here when they've determined that the receiver queued * messages up to, and including, the given sequence number. Messages are * moved to the retrans queue when rds_send_xmit picks them off the send * queue. This means that in the TCP case, the message may not have been * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked * checks the RDS_MSG_HAS_ACK_SEQ bit. */ void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, is_acked_func is_acked) { struct rds_message *rm, *tmp; unsigned long flags; LIST_HEAD(list); spin_lock_irqsave(&cp->cp_lock, flags); list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { if (!rds_send_is_acked(rm, ack, is_acked)) break; list_move(&rm->m_conn_item, &list); clear_bit(RDS_MSG_ON_CONN, &rm->m_flags); } /* order flag updates with spin locks */ if (!list_empty(&list)) smp_mb__after_atomic(); spin_unlock_irqrestore(&cp->cp_lock, flags); /* now remove the messages from the sock list as needed */ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); } EXPORT_SYMBOL_GPL(rds_send_path_drop_acked); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked) { WARN_ON(conn->c_trans->t_mp_capable); rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked); } EXPORT_SYMBOL_GPL(rds_send_drop_acked); void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; struct rds_conn_path *cp; unsigned long flags; LIST_HEAD(list); /* get all the messages we're dropping under the rs lock */ spin_lock_irqsave(&rs->rs_lock, flags); list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { if (dest && (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) || dest->sin6_port != rm->m_inc.i_hdr.h_dport)) continue; list_move(&rm->m_sock_item, &list); rds_send_sndbuf_remove(rs, rm); clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); } /* order flag updates with the rs lock */ smp_mb__after_atomic(); spin_unlock_irqrestore(&rs->rs_lock, flags); if (list_empty(&list)) return; /* Remove the messages from the conn */ list_for_each_entry(rm, &list, m_sock_item) { conn = rm->m_inc.i_conn; if (conn->c_trans->t_mp_capable) cp = rm->m_inc.i_conn_path; else cp = &conn->c_path[0]; spin_lock_irqsave(&cp->cp_lock, flags); /* * Maybe someone else beat us to removing rm from the conn. * If we race with their flag update we'll get the lock and * then really see that the flag has been cleared. */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { spin_unlock_irqrestore(&cp->cp_lock, flags); continue; } list_del_init(&rm->m_conn_item); spin_unlock_irqrestore(&cp->cp_lock, flags); /* * Couldn't grab m_rs_lock in top loop (lock ordering), * but we can now. */ spin_lock_irqsave(&rm->m_rs_lock, flags); spin_lock(&rs->rs_lock); __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); } rds_wake_sk_sleep(rs); while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); list_del_init(&rm->m_sock_item); rds_message_wait(rm); /* just in case the code above skipped this message * because RDS_MSG_ON_CONN wasn't set, run it again here * taking m_rs_lock is the only thing that keeps us * from racing with ack processing. */ spin_lock_irqsave(&rm->m_rs_lock, flags); spin_lock(&rs->rs_lock); __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); } } /* * we only want this to fire once so we use the callers 'queued'. It's * possible that another thread can race with us and remove the * message from the flow with RDS_CANCEL_SENT_TO. */ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, struct rds_conn_path *cp, struct rds_message *rm, __be16 sport, __be16 dport, int *queued) { unsigned long flags; u32 len; if (*queued) goto out; len = be32_to_cpu(rm->m_inc.i_hdr.h_len); /* this is the only place which holds both the socket's rs_lock * and the connection's c_lock */ spin_lock_irqsave(&rs->rs_lock, flags); /* * If there is a little space in sndbuf, we don't queue anything, * and userspace gets -EAGAIN. But poll() indicates there's send * room. This can lead to bad behavior (spinning) if snd_bytes isn't * freed up by incoming acks. So we check the *old* value of * rs_snd_bytes here to allow the last msg to exceed the buffer, * and poll() now knows no more data can be sent. */ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { rs->rs_snd_bytes += len; /* let recv side know we are close to send space exhaustion. * This is probably not the optimal way to do it, as this * means we set the flag on *all* messages as soon as our * throughput hits a certain threshold. */ if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); rds_message_addref(rm); sock_hold(rds_rs_to_sk(rs)); rm->m_rs = rs; /* The code ordering is a little weird, but we're trying to minimize the time we hold c_lock */ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); rm->m_inc.i_conn = conn; rm->m_inc.i_conn_path = cp; rds_message_addref(rm); spin_lock(&cp->cp_lock); rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++); list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); spin_unlock(&cp->cp_lock); rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", rm, len, rs, rs->rs_snd_bytes, (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); *queued = 1; } spin_unlock_irqrestore(&rs->rs_lock, flags); out: return *queued; } /* * rds_message is getting to be quite complicated, and we'd like to allocate * it all in one go. This figures out how big it needs to be up front. */ static int rds_rm_size(struct msghdr *msg, int num_sgs, struct rds_iov_vector_arr *vct) { struct cmsghdr *cmsg; int size = 0; int cmsg_groups = 0; int retval; bool zcopy_cookie = false; struct rds_iov_vector *iov, *tmp_iov; if (num_sgs < 0) return -EINVAL; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_RDS) continue; switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: if (vct->indx >= vct->len) { vct->len += vct->incr; tmp_iov = krealloc(vct->vec, vct->len * sizeof(struct rds_iov_vector), GFP_KERNEL); if (!tmp_iov) { vct->len -= vct->incr; return -ENOMEM; } vct->vec = tmp_iov; } iov = &vct->vec[vct->indx]; memset(iov, 0, sizeof(struct rds_iov_vector)); vct->indx++; cmsg_groups |= 1; retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov); if (retval < 0) return retval; size += retval; break; case RDS_CMSG_ZCOPY_COOKIE: zcopy_cookie = true; fallthrough; case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: cmsg_groups |= 2; /* these are valid but do no add any size */ break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: case RDS_CMSG_MASKED_ATOMIC_CSWP: case RDS_CMSG_MASKED_ATOMIC_FADD: cmsg_groups |= 1; size += sizeof(struct scatterlist); break; default: return -EINVAL; } } if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) return -EINVAL; size += num_sgs * sizeof(struct scatterlist); /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ if (cmsg_groups == 3) return -EINVAL; return size; } static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { u32 *cookie; if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) || !rm->data.op_mmp_znotifier) return -EINVAL; cookie = CMSG_DATA(cmsg); rm->data.op_mmp_znotifier->z_cookie = *cookie; return 0; } static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr, struct rds_iov_vector_arr *vct) { struct cmsghdr *cmsg; int ret = 0, ind = 0; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_RDS) continue; /* As a side effect, RDMA_DEST and RDMA_MAP will set * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. */ switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: if (ind >= vct->indx) return -ENOMEM; ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]); ind++; break; case RDS_CMSG_RDMA_DEST: ret = rds_cmsg_rdma_dest(rs, rm, cmsg); break; case RDS_CMSG_RDMA_MAP: ret = rds_cmsg_rdma_map(rs, rm, cmsg); if (!ret) *allocated_mr = 1; else if (ret == -ENODEV) /* Accommodate the get_mr() case which can fail * if connection isn't established yet. */ ret = -EAGAIN; break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: case RDS_CMSG_MASKED_ATOMIC_CSWP: case RDS_CMSG_MASKED_ATOMIC_FADD: ret = rds_cmsg_atomic(rs, rm, cmsg); break; case RDS_CMSG_ZCOPY_COOKIE: ret = rds_cmsg_zcopy(rs, rm, cmsg); break; default: return -EINVAL; } if (ret) break; } return ret; } static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn, int nonblock) { int hash; if (conn->c_npaths == 0) hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS); else hash = RDS_MPATH_HASH(rs, conn->c_npaths); if (conn->c_npaths == 0 && hash != 0) { rds_send_ping(conn, 0); /* The underlying connection is not up yet. Need to wait * until it is up to be sure that the non-zero c_path can be * used. But if we are interrupted, we have to use the zero * c_path in case the connection ends up being non-MP capable. */ if (conn->c_npaths == 0) { /* Cannot wait for the connection be made, so just use * the base c_path. */ if (nonblock) return 0; if (wait_event_interruptible(conn->c_hs_waitq, conn->c_npaths != 0)) hash = 0; } if (conn->c_npaths == 1) hash = 0; } return hash; } static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) { struct rds_rdma_args *args; struct cmsghdr *cmsg; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_RDS) continue; if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))) return -EINVAL; args = CMSG_DATA(cmsg); *rdma_bytes += args->remote_vec.bytes; } } return 0; } int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); __be16 dport; struct rds_message *rm = NULL; struct rds_connection *conn; int ret = 0; int queued = 0, allocated_mr = 0; int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; struct in6_addr daddr; __u32 scope_id = 0; size_t total_payload_len = payload_len, rdma_payload_len = 0; bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE); int namelen; struct rds_iov_vector_arr vct; int ind; memset(&vct, 0, sizeof(vct)); /* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. */ vct.incr = 1; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { ret = -EOPNOTSUPP; goto out; } namelen = msg->msg_namelen; if (namelen != 0) { if (namelen < sizeof(*usin)) { ret = -EINVAL; goto out; } switch (usin->sin_family) { case AF_INET: if (usin->sin_addr.s_addr == htonl(INADDR_ANY) || usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || ipv4_is_multicast(usin->sin_addr.s_addr)) { ret = -EINVAL; goto out; } ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr); dport = usin->sin_port; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { int addr_type; if (namelen < sizeof(*sin6)) { ret = -EINVAL; goto out; } addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) { ret = -EINVAL; goto out; } /* It is a mapped address. Need to do some * sanity checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) { ret = -EINVAL; goto out; } } if (addr_type & IPV6_ADDR_LINKLOCAL) { if (sin6->sin6_scope_id == 0) { ret = -EINVAL; goto out; } scope_id = sin6->sin6_scope_id; } daddr = sin6->sin6_addr; dport = sin6->sin6_port; break; } #endif default: ret = -EINVAL; goto out; } } else { /* We only care about consistency with ->connect() */ lock_sock(sk); daddr = rs->rs_conn_addr; dport = rs->rs_conn_port; scope_id = rs->rs_bound_scope_id; release_sock(sk); } lock_sock(sk); if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) { release_sock(sk); ret = -ENOTCONN; goto out; } else if (namelen != 0) { /* Cannot send to an IPv4 address using an IPv6 source * address and cannot send to an IPv6 address using an * IPv4 source address. */ if (ipv6_addr_v4mapped(&daddr) ^ ipv6_addr_v4mapped(&rs->rs_bound_addr)) { release_sock(sk); ret = -EOPNOTSUPP; goto out; } /* If the socket is already bound to a link local address, * it can only send to peers on the same link. But allow * communicating between link local and non-link local address. */ if (scope_id != rs->rs_bound_scope_id) { if (!scope_id) { scope_id = rs->rs_bound_scope_id; } else if (rs->rs_bound_scope_id) { release_sock(sk); ret = -EINVAL; goto out; } } } release_sock(sk); ret = rds_rdma_bytes(msg, &rdma_payload_len); if (ret) goto out; total_payload_len += rdma_payload_len; if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) { ret = -EMSGSIZE; goto out; } if (payload_len > rds_sk_sndbuf(rs)) { ret = -EMSGSIZE; goto out; } if (zcopy) { if (rs->rs_transport->t_type != RDS_TRANS_TCP) { ret = -EOPNOTSUPP; goto out; } num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); } /* size of rm including all sgs */ ret = rds_rm_size(msg, num_sgs, &vct); if (ret < 0) goto out; rm = rds_message_alloc(ret, GFP_KERNEL); if (!rm) { ret = -ENOMEM; goto out; } /* Attach data to the rm */ if (payload_len) { rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); if (IS_ERR(rm->data.op_sg)) { ret = PTR_ERR(rm->data.op_sg); goto out; } ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); if (ret) goto out; } rm->data.op_active = 1; rm->m_daddr = daddr; /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) && rs->rs_tos == rs->rs_conn->c_tos) { conn = rs->rs_conn; } else { conn = rds_conn_create_outgoing(sock_net(sock->sk), &rs->rs_bound_addr, &daddr, rs->rs_transport, rs->rs_tos, sock->sk->sk_allocation, scope_id); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; } rs->rs_conn = conn; } if (conn->c_trans->t_mp_capable) cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)]; else cpath = &conn->c_path[0]; rm->m_conn_path = cpath; /* Parse any control messages the user may have included. */ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct); if (ret) { /* Trigger connection so that its ready for the next retry */ if (ret == -EAGAIN) rds_conn_connect_if_down(conn); goto out; } if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", &rm->rdma, conn->c_trans->xmit_rdma); ret = -EOPNOTSUPP; goto out; } if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", &rm->atomic, conn->c_trans->xmit_atomic); ret = -EOPNOTSUPP; goto out; } if (rds_destroy_pending(conn)) { ret = -EAGAIN; goto out; } if (rds_conn_path_down(cpath)) rds_check_all_paths(conn); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); if (ret) { rs->rs_seen_congestion = 1; goto out; } while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); if (nonblock) { ret = -EAGAIN; goto out; } timeo = wait_event_interruptible_timeout(*sk_sleep(sk), rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued), timeo); rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo); if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) continue; ret = timeo; if (ret == 0) ret = -ETIMEDOUT; goto out; } /* * By now we've committed to the send. We reuse rds_send_worker() * to retry sends in the rds thread if the transport asks us to. */ rds_stats_inc(s_send_queued); ret = rds_send_xmit(cpath); if (ret == -ENOMEM || ret == -EAGAIN) { ret = 0; rcu_read_lock(); if (rds_destroy_pending(cpath->cp_conn)) ret = -ENETUNREACH; else queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); rcu_read_unlock(); } if (ret) goto out; rds_message_put(rm); for (ind = 0; ind < vct.indx; ind++) kfree(vct.vec[ind].iov); kfree(vct.vec); return payload_len; out: for (ind = 0; ind < vct.indx; ind++) kfree(vct.vec[ind].iov); kfree(vct.vec); /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN * or in any other way, we need to destroy the MR again */ if (allocated_mr) rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); if (rm) rds_message_put(rm); return ret; } /* * send out a probe. Can be shared by rds_send_ping, * rds_send_pong, rds_send_hb. * rds_send_hb should use h_flags * RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED * or * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED */ static int rds_send_probe(struct rds_conn_path *cp, __be16 sport, __be16 dport, u8 h_flags) { struct rds_message *rm; unsigned long flags; int ret = 0; rm = rds_message_alloc(0, GFP_ATOMIC); if (!rm) { ret = -ENOMEM; goto out; } rm->m_daddr = cp->cp_conn->c_faddr; rm->data.op_active = 1; rds_conn_path_connect_if_down(cp); ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL); if (ret) goto out; spin_lock_irqsave(&cp->cp_lock, flags); list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); rds_message_addref(rm); rm->m_inc.i_conn = cp->cp_conn; rm->m_inc.i_conn_path = cp; rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, cp->cp_next_tx_seq); rm->m_inc.i_hdr.h_flags |= h_flags; cp->cp_next_tx_seq++; if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) && cp->cp_conn->c_trans->t_mp_capable) { u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS, &npaths, sizeof(npaths)); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_GEN_NUM, &my_gen_num, sizeof(u32)); } spin_unlock_irqrestore(&cp->cp_lock, flags); rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); /* schedule the send work on rds_wq */ rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_send_w, 1); rcu_read_unlock(); rds_message_put(rm); return 0; out: if (rm) rds_message_put(rm); return ret; } int rds_send_pong(struct rds_conn_path *cp, __be16 dport) { return rds_send_probe(cp, 0, dport, 0); } void rds_send_ping(struct rds_connection *conn, int cp_index) { unsigned long flags; struct rds_conn_path *cp = &conn->c_path[cp_index]; spin_lock_irqsave(&cp->cp_lock, flags); if (conn->c_ping_triggered) { spin_unlock_irqrestore(&cp->cp_lock, flags); return; } conn->c_ping_triggered = 1; spin_unlock_irqrestore(&cp->cp_lock, flags); rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0); } EXPORT_SYMBOL_GPL(rds_send_ping);
3 3 17 16 14 14 6 6 11 2 9 3 1 2 11 1 2 11 6 3 3 10 2 9 14 9 9 7 2 9 9 9 9 11 11 10 4 9 11 2 11 9 5 5 1 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 /* * net/tipc/group.c: TIPC group messaging code * * Copyright (c) 2017, Ericsson AB * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "addr.h" #include "group.h" #include "bcast.h" #include "topsrv.h" #include "msg.h" #include "socket.h" #include "node.h" #include "name_table.h" #include "subscr.h" #define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) #define ADV_IDLE ADV_UNIT #define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { MBR_JOINING, MBR_PUBLISHED, MBR_JOINED, MBR_PENDING, MBR_ACTIVE, MBR_RECLAIMING, MBR_REMITTED, MBR_LEAVING }; struct tipc_member { struct rb_node tree_node; struct list_head list; struct list_head small_win; struct sk_buff_head deferredq; struct tipc_group *group; u32 node; u32 port; u32 instance; enum mbr_state state; u16 advertised; u16 window; u16 bc_rcv_nxt; u16 bc_syncpt; u16 bc_acked; }; struct tipc_group { struct rb_root members; struct list_head small_win; struct list_head pending; struct list_head active; struct tipc_nlist dests; struct net *net; int subid; u32 type; u32 instance; u32 scope; u32 portid; u16 member_cnt; u16 active_cnt; u16 max_active; u16 bc_snd_nxt; u16 bc_ackers; bool *open; bool loopback; bool events; }; static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); static void tipc_group_open(struct tipc_member *m, bool *wakeup) { *wakeup = false; if (list_empty(&m->small_win)) return; list_del_init(&m->small_win); *m->group->open = true; *wakeup = true; } static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING || m->state == MBR_REMITTED) grp->active_cnt--; } static int tipc_group_rcvbuf_limit(struct tipc_group *grp) { int max_active, active_pool, idle_pool; int mcnt = grp->member_cnt + 1; /* Limit simultaneous reception from other members */ max_active = min(mcnt / 8, 64); max_active = max(max_active, 16); grp->max_active = max_active; /* Reserve blocks for active and idle members */ active_pool = max_active * ADV_ACTIVE; idle_pool = (mcnt - max_active) * ADV_IDLE; /* Scale to bytes, considering worst-case truesize/msgsize ratio */ return (active_pool + idle_pool) * FLOWCTL_BLK_SZ * 4; } u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) { return grp->bc_snd_nxt; } static bool tipc_group_is_receiver(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_LEAVING; } static bool tipc_group_is_sender(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_PUBLISHED; } u32 tipc_group_exclude(struct tipc_group *grp) { if (!grp->loopback) return grp->portid; return 0; } struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq, bool *group_is_open) { u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS; bool global = mreq->scope != TIPC_NODE_SCOPE; struct tipc_group *grp; u32 type = mreq->type; grp = kzalloc(sizeof(*grp), GFP_ATOMIC); if (!grp) return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); INIT_LIST_HEAD(&grp->small_win); INIT_LIST_HEAD(&grp->active); INIT_LIST_HEAD(&grp->pending); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; grp->type = type; grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; grp->open = group_is_open; *grp->open = false; filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, filter, &grp->subid)) return grp; kfree(grp); return NULL; } void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq); tipc_group_update_member(m, 0); } tipc_node_distr_xmit(net, &xmitq); *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } void tipc_group_delete(struct net *net, struct tipc_group *grp) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); __skb_queue_purge(&m->deferredq); list_del(&m->list); kfree(m); } tipc_node_distr_xmit(net, &xmitq); tipc_nlist_purge(&grp->dests); tipc_topsrv_kern_unsubscr(net, grp->subid); kfree(grp); } static struct tipc_member *tipc_group_find_member(struct tipc_group *grp, u32 node, u32 port) { struct rb_node *n = grp->members.rb_node; u64 nkey, key = (u64)node << 32 | port; struct tipc_member *m; while (n) { m = container_of(n, struct tipc_member, tree_node); nkey = (u64)m->node << 32 | m->port; if (key < nkey) n = n->rb_left; else if (key > nkey) n = n->rb_right; else return m; } return NULL; } static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, u32 node, u32 port) { struct tipc_member *m; m = tipc_group_find_member(grp, node, port); if (m && tipc_group_is_receiver(m)) return m; return NULL; } static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, u32 node) { struct tipc_member *m; struct rb_node *n; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (m->node == node) return m; } return NULL; } static int tipc_group_add_to_tree(struct tipc_group *grp, struct tipc_member *m) { u64 nkey, key = (u64)m->node << 32 | m->port; struct rb_node **n, *parent = NULL; struct tipc_member *tmp; n = &grp->members.rb_node; while (*n) { tmp = container_of(*n, struct tipc_member, tree_node); parent = *n; tmp = container_of(parent, struct tipc_member, tree_node); nkey = (u64)tmp->node << 32 | tmp->port; if (key < nkey) n = &(*n)->rb_left; else if (key > nkey) n = &(*n)->rb_right; else return -EEXIST; } rb_link_node(&m->tree_node, parent, n); rb_insert_color(&m->tree_node, &grp->members); return 0; } static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, u32 node, u32 port, u32 instance, int state) { struct tipc_member *m; int ret; m = kzalloc(sizeof(*m), GFP_ATOMIC); if (!m) return NULL; INIT_LIST_HEAD(&m->list); INIT_LIST_HEAD(&m->small_win); __skb_queue_head_init(&m->deferredq); m->group = grp; m->node = node; m->port = port; m->instance = instance; m->bc_acked = grp->bc_snd_nxt - 1; ret = tipc_group_add_to_tree(grp, m); if (ret < 0) { kfree(m); return NULL; } grp->member_cnt++; tipc_nlist_add(&grp->dests, m->node); m->state = state; return m; } void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port, u32 instance) { tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); } static void tipc_group_delete_member(struct tipc_group *grp, struct tipc_member *m) { rb_erase(&m->tree_node, &grp->members); grp->member_cnt--; /* Check if we were waiting for replicast ack from this member */ if (grp->bc_ackers && less(m->bc_acked, grp->bc_snd_nxt - 1)) grp->bc_ackers--; list_del_init(&m->list); list_del_init(&m->small_win); tipc_group_decr_active(grp, m); /* If last member on a node, remove node from dest list */ if (!tipc_group_find_node(grp, m->node)) tipc_nlist_del(&grp->dests, m->node); kfree(m); } struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) { return &grp->dests; } void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope) { seq->type = grp->type; seq->lower = grp->instance; seq->upper = grp->instance; *scope = grp->scope; } void tipc_group_update_member(struct tipc_member *m, int len) { struct tipc_group *grp = m->group; struct tipc_member *_m, *tmp; if (!tipc_group_is_receiver(m)) return; m->window -= len; if (m->window >= ADV_IDLE) return; list_del_init(&m->small_win); /* Sort member into small_window members' list */ list_for_each_entry_safe(_m, tmp, &grp->small_win, small_win) { if (_m->window > m->window) break; } list_add_tail(&m->small_win, &_m->small_win); } void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) { u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; u16 ackers = 0; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (tipc_group_is_receiver(m)) { tipc_group_update_member(m, len); m->bc_acked = prev; ackers++; } } /* Mark number of acknowledges to expect, if any */ if (ack) grp->bc_ackers = ackers; grp->bc_snd_nxt++; } bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int len, struct tipc_member **mbr) { struct sk_buff_head xmitq; struct tipc_member *m; int adv, state; m = tipc_group_find_dest(grp, dnode, dport); if (!tipc_group_is_receiver(m)) { *mbr = NULL; return false; } *mbr = m; if (m->window >= len) return false; *grp->open = false; /* If not fully advertised, do it now to prevent mutual blocking */ adv = m->advertised; state = m->state; if (state == MBR_JOINED && adv == ADV_IDLE) return true; if (state == MBR_ACTIVE && adv == ADV_ACTIVE) return true; if (state == MBR_PENDING && adv == ADV_IDLE) return true; __skb_queue_head_init(&xmitq); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); tipc_node_distr_xmit(grp->net, &xmitq); return true; } bool tipc_group_bc_cong(struct tipc_group *grp, int len) { struct tipc_member *m = NULL; /* If prev bcast was replicast, reject until all receivers have acked */ if (grp->bc_ackers) { *grp->open = false; return true; } if (list_empty(&grp->small_win)) return false; m = list_first_entry(&grp->small_win, struct tipc_member, small_win); if (m->window >= len) return false; return tipc_group_cong(grp, m->node, m->port, len, &m); } /* tipc_group_sort_msg() - sort msg into queue by bcast sequence number */ static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) { struct tipc_msg *_hdr, *hdr = buf_msg(skb); u16 bc_seqno = msg_grp_bc_seqno(hdr); struct sk_buff *_skb, *tmp; int mtyp = msg_type(hdr); /* Bcast/mcast may be bypassed by ucast or other bcast, - sort it in */ if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); if (!less(bc_seqno, msg_grp_bc_seqno(_hdr))) continue; __skb_queue_before(defq, _skb, skb); return; } /* Bcast was not bypassed, - add to tail */ } /* Unicasts are never bypassed, - always add to tail */ __skb_queue_tail(defq, skb); } /* tipc_group_filter_msg() - determine if we should accept arriving message */ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); bool ack, deliver, update, leave = false; struct sk_buff_head *defq; struct tipc_member *m; struct tipc_msg *hdr; u32 node, port; int mtyp, blks; if (!skb) return; hdr = buf_msg(skb); node = msg_orignode(hdr); port = msg_origport(hdr); if (!msg_in_group(hdr)) goto drop; m = tipc_group_find_member(grp, node, port); if (!tipc_group_is_sender(m)) goto drop; if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) goto drop; TIPC_SKB_CB(skb)->orig_member = m->instance; defq = &m->deferredq; tipc_group_sort_msg(skb, defq); while ((skb = skb_peek(defq))) { hdr = buf_msg(skb); mtyp = msg_type(hdr); blks = msg_blocks(hdr); deliver = true; ack = false; update = false; if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) break; /* Decide what to do with message */ switch (mtyp) { case TIPC_GRP_MCAST_MSG: if (msg_nameinst(hdr) != grp->instance) { update = true; deliver = false; } fallthrough; case TIPC_GRP_BCAST_MSG: m->bc_rcv_nxt++; ack = msg_grp_bc_ack_req(hdr); break; case TIPC_GRP_UCAST_MSG: break; case TIPC_GRP_MEMBER_EVT: if (m->state == MBR_LEAVING) leave = true; if (!grp->events) deliver = false; break; default: break; } /* Execute decisions */ __skb_dequeue(defq); if (deliver) __skb_queue_tail(inputq, skb); else kfree_skb(skb); if (ack) tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); if (leave) { __skb_queue_purge(defq); tipc_group_delete_member(grp, m); break; } if (!update) continue; tipc_group_update_rcv_win(grp, blks, node, port, xmitq); } return; drop: kfree_skb(skb); } void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq) { struct list_head *active = &grp->active; int max_active = grp->max_active; int reclaim_limit = max_active * 3 / 4; int active_cnt = grp->active_cnt; struct tipc_member *m, *rm, *pm; m = tipc_group_find_member(grp, node, port); if (!m) return; m->advertised -= blks; switch (m->state) { case MBR_JOINED: /* First, decide if member can go active */ if (active_cnt <= max_active) { m->state = MBR_ACTIVE; list_add_tail(&m->list, active); grp->active_cnt++; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } else { m->state = MBR_PENDING; list_add_tail(&m->list, &grp->pending); } if (active_cnt < reclaim_limit) break; /* Reclaim from oldest active member, if possible */ if (!list_empty(active)) { rm = list_first_entry(active, struct tipc_member, list); rm->state = MBR_RECLAIMING; list_del_init(&rm->list); tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); break; } /* Nobody to reclaim from; - revert oldest pending to JOINED */ pm = list_first_entry(&grp->pending, struct tipc_member, list); list_del_init(&pm->list); pm->state = MBR_JOINED; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_ACTIVE: if (!list_is_last(&m->list, &grp->active)) list_move_tail(&m->list, &grp->active); if (m->advertised > (ADV_ACTIVE * 3 / 4)) break; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); break; case MBR_REMITTED: if (m->advertised > ADV_IDLE) break; m->state = MBR_JOINED; grp->active_cnt--; if (m->advertised < ADV_IDLE) { pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } if (list_empty(&grp->pending)) return; /* Set oldest pending member to active and advertise */ pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: case MBR_JOINING: case MBR_LEAVING: default: break; } } static void tipc_group_create_event(struct tipc_group *grp, struct tipc_member *m, u32 event, u16 seqno, struct sk_buff_head *inputq) { u32 dnode = tipc_own_addr(grp->net); struct tipc_event evt; struct sk_buff *skb; struct tipc_msg *hdr; memset(&evt, 0, sizeof(evt)); evt.event = event; evt.found_lower = m->instance; evt.found_upper = m->instance; evt.port.ref = m->port; evt.port.node = m->node; evt.s.seq.type = grp->type; evt.s.seq.lower = m->instance; evt.s.seq.upper = m->instance; skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_GRP_MEMBER_EVT, GROUP_H_SIZE, sizeof(evt), dnode, m->node, grp->portid, m->port, 0); if (!skb) return; hdr = buf_msg(skb); msg_set_nametype(hdr, grp->type); msg_set_grp_evt(hdr, event); msg_set_dest_droppable(hdr, true); msg_set_grp_bc_seqno(hdr, seqno); memcpy(msg_data(hdr), &evt, sizeof(evt)); TIPC_SKB_CB(skb)->orig_member = m->instance; __skb_queue_tail(inputq, skb); } static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq) { struct tipc_msg *hdr; struct sk_buff *skb; int adv = 0; skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, m->node, tipc_own_addr(grp->net), m->port, grp->portid, 0); if (!skb) return; if (m->state == MBR_ACTIVE) adv = ADV_ACTIVE - m->advertised; else if (m->state == MBR_JOINED || m->state == MBR_PENDING) adv = ADV_IDLE - m->advertised; hdr = buf_msg(skb); if (mtyp == GRP_JOIN_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_LEAVE_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); } else if (mtyp == GRP_ADV_MSG) { msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_ACK_MSG) { msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); } else if (mtyp == GRP_REMIT_MSG) { msg_set_grp_remitted(hdr, m->window); } msg_set_dest_droppable(hdr, true); __skb_queue_tail(xmitq, skb); } void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); struct tipc_member *m, *pm; u16 remitted, in_flight; if (!grp) return; if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net)) return; m = tipc_group_find_member(grp, node, port); switch (msg_type(hdr)) { case GRP_JOIN_MSG: if (!m) m = tipc_group_create_member(grp, node, port, 0, MBR_JOINING); if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); m->bc_rcv_nxt = m->bc_syncpt; m->window += msg_adv_win(hdr); /* Wait until PUBLISH event is received if necessary */ if (m->state != MBR_PUBLISHED) return; /* Member can be taken into service */ m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); return; case GRP_LEAVE_MSG: if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); list_del_init(&m->list); tipc_group_open(m, usr_wakeup); tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_syncpt, inputq); return; case GRP_ADV_MSG: if (!m) return; m->window += msg_adv_win(hdr); tipc_group_open(m, usr_wakeup); return; case GRP_ACK_MSG: if (!m) return; m->bc_acked = msg_grp_bc_acked(hdr); if (--grp->bc_ackers) return; list_del_init(&m->small_win); *m->group->open = true; *usr_wakeup = true; tipc_group_update_member(m, 0); return; case GRP_RECLAIM_MSG: if (!m) return; tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); m->window = ADV_IDLE; tipc_group_open(m, usr_wakeup); return; case GRP_REMIT_MSG: if (!m || m->state != MBR_RECLAIMING) return; remitted = msg_grp_remitted(hdr); /* Messages preceding the REMIT still in receive queue */ if (m->advertised > remitted) { m->state = MBR_REMITTED; in_flight = m->advertised - remitted; m->advertised = ADV_IDLE + in_flight; return; } /* This should never happen */ if (m->advertised < remitted) pr_warn_ratelimited("Unexpected REMIT msg\n"); /* All messages preceding the REMIT have been read */ m->state = MBR_JOINED; grp->active_cnt--; m->advertised = ADV_IDLE; /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) return; pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; if (pm->advertised <= (ADV_ACTIVE * 3 / 4)) tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); return; default: pr_warn("Received unknown GROUP_PROTO message\n"); } } /* tipc_group_member_evt() - receive and handle a member up/down event */ void tipc_group_member_evt(struct tipc_group *grp, bool *usr_wakeup, int *sk_rcvbuf, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_event *evt = (void *)msg_data(hdr); u32 instance = evt->found_lower; u32 node = evt->port.node; u32 port = evt->port.ref; int event = evt->event; struct tipc_member *m; struct net *net; u32 self; if (!grp) return; net = grp->net; self = tipc_own_addr(net); if (!grp->loopback && node == self && port == grp->portid) return; m = tipc_group_find_member(grp, node, port); switch (event) { case TIPC_PUBLISHED: /* Send and wait for arrival of JOIN message if necessary */ if (!m) { m = tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); if (!m) break; tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); break; } if (m->state != MBR_JOINING) break; /* Member can be taken into service */ m->instance = instance; m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); break; case TIPC_WITHDRAWN: if (!m) break; tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; list_del_init(&m->list); tipc_group_open(m, usr_wakeup); /* Only send event if no LEAVE message can be expected */ if (!tipc_node_is_up(net, node)) tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_rcv_nxt, inputq); break; default: break; } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) { struct nlattr *group = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_GROUP); if (!group) return -EMSGSIZE; if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID, grp->type) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE, grp->instance) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, grp->bc_snd_nxt)) goto group_msg_cancel; if (grp->scope == TIPC_NODE_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE)) goto group_msg_cancel; if (grp->scope == TIPC_CLUSTER_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE)) goto group_msg_cancel; if (*grp->open) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN)) goto group_msg_cancel; nla_nest_end(skb, group); return 0; group_msg_cancel: nla_nest_cancel(skb, group); return -1; }
564 564 564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module for IP set management */ #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/rculist.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/ipset/ip_set.h> static LIST_HEAD(ip_set_type_list); /* all registered set types */ static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ struct ip_set_net { struct ip_set * __rcu *ip_set_list; /* all individual sets */ ip_set_id_t ip_set_max; /* max number of sets */ bool is_deleted; /* deleted by ip_set_net_exit */ bool is_destroyed; /* all sets are destroyed */ }; static unsigned int ip_set_net_id __read_mostly; static struct ip_set_net *ip_set_pernet(struct net *net) { return net_generic(net, ip_set_net_id); } #define IP_SET_INC 64 #define STRNCMP(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) static unsigned int max_sets; module_param(max_sets, int, 0600); MODULE_PARM_DESC(max_sets, "maximal number of sets"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex or ip_set_ref_lock is held: */ #define ip_set_dereference(p) \ rcu_dereference_protected(p, \ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ lockdep_is_held(&ip_set_ref_lock)) #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is * serialized by ip_set_type_mutex. */ static void ip_set_type_lock(void) { mutex_lock(&ip_set_type_mutex); } static void ip_set_type_unlock(void) { mutex_unlock(&ip_set_type_mutex); } /* Register and deregister settype */ static struct ip_set_type * find_set_type(const char *name, u8 family, u8 revision) { struct ip_set_type *type; list_for_each_entry_rcu(type, &ip_set_type_list, list, lockdep_is_held(&ip_set_type_mutex)) if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC) && revision >= type->revision_min && revision <= type->revision_max) return type; return NULL; } /* Unlock, try to load a set type module and lock again */ static bool load_settype(const char *name) { nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { pr_warn("Can't find ip_set type %s\n", name); nfnl_lock(NFNL_SUBSYS_IPSET); return false; } nfnl_lock(NFNL_SUBSYS_IPSET); return true; } /* Find a set type and reference it */ #define find_set_type_get(name, family, revision, found) \ __find_set_type_get(name, family, revision, found, false) static int __find_set_type_get(const char *name, u8 family, u8 revision, struct ip_set_type **found, bool retry) { struct ip_set_type *type; int err; if (retry && !load_settype(name)) return -IPSET_ERR_FIND_TYPE; rcu_read_lock(); *found = find_set_type(name, family, revision); if (*found) { err = !try_module_get((*found)->me) ? -EFAULT : 0; goto unlock; } /* Make sure the type is already loaded * but we don't support the revision */ list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STRNCMP(type->name, name)) { err = -IPSET_ERR_FIND_TYPE; goto unlock; } rcu_read_unlock(); return retry ? -IPSET_ERR_FIND_TYPE : __find_set_type_get(name, family, revision, found, true); unlock: rcu_read_unlock(); return err; } /* Find a given set type by name and family. * If we succeeded, the supported minimal and maximum revisions are * filled out. */ #define find_set_type_minmax(name, family, min, max) \ __find_set_type_minmax(name, family, min, max, false) static int __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, bool retry) { struct ip_set_type *type; bool found = false; if (retry && !load_settype(name)) return -IPSET_ERR_FIND_TYPE; *min = 255; *max = 0; rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC)) { found = true; if (type->revision_min < *min) *min = type->revision_min; if (type->revision_max > *max) *max = type->revision_max; } rcu_read_unlock(); if (found) return 0; return retry ? -IPSET_ERR_FIND_TYPE : __find_set_type_minmax(name, family, min, max, true); } #define family_name(f) ((f) == NFPROTO_IPV4 ? "inet" : \ (f) == NFPROTO_IPV6 ? "inet6" : "any") /* Register a set type structure. The type is identified by * the unique triple of name, family and revision. */ int ip_set_type_register(struct ip_set_type *type) { int ret = 0; if (type->protocol != IPSET_PROTOCOL) { pr_warn("ip_set type %s, family %s, revision %u:%u uses wrong protocol version %u (want %u)\n", type->name, family_name(type->family), type->revision_min, type->revision_max, type->protocol, IPSET_PROTOCOL); return -EINVAL; } ip_set_type_lock(); if (find_set_type(type->name, type->family, type->revision_min)) { /* Duplicate! */ pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", type->name, family_name(type->family), type->revision_min); ip_set_type_unlock(); return -EINVAL; } list_add_rcu(&type->list, &ip_set_type_list); pr_debug("type %s, family %s, revision %u:%u registered.\n", type->name, family_name(type->family), type->revision_min, type->revision_max); ip_set_type_unlock(); return ret; } EXPORT_SYMBOL_GPL(ip_set_type_register); /* Unregister a set type. There's a small race with ip_set_create */ void ip_set_type_unregister(struct ip_set_type *type) { ip_set_type_lock(); if (!find_set_type(type->name, type->family, type->revision_min)) { pr_warn("ip_set type %s, family %s with revision min %u not registered\n", type->name, family_name(type->family), type->revision_min); ip_set_type_unlock(); return; } list_del_rcu(&type->list); pr_debug("type %s, family %s with revision min %u unregistered.\n", type->name, family_name(type->family), type->revision_min); ip_set_type_unlock(); synchronize_rcu(); } EXPORT_SYMBOL_GPL(ip_set_type_unregister); /* Utility functions */ void * ip_set_alloc(size_t size) { return kvzalloc(size, GFP_KERNEL_ACCOUNT); } EXPORT_SYMBOL_GPL(ip_set_alloc); void ip_set_free(void *members) { pr_debug("%p: free with %s\n", members, is_vmalloc_addr(members) ? "vfree" : "kfree"); kvfree(members); } EXPORT_SYMBOL_GPL(ip_set_free); static bool flag_nested(const struct nlattr *nla) { return nla->nla_type & NLA_F_NESTED; } static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 }, [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), }; int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) { struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) return -IPSET_ERR_PROTOCOL; *ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) { struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) return -IPSET_ERR_PROTOCOL; memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), sizeof(struct in6_addr)); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); static u32 ip_set_timeout_get(const unsigned long *timeout) { u32 t; if (*timeout == IPSET_ELEM_PERMANENT) return 0; t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC; /* Zero value in userspace means no timeout */ return t == 0 ? 1 : t; } static char * ip_set_comment_uget(struct nlattr *tb) { return nla_data(tb); } /* Called from uadd only, protected by the set spinlock. * The kadt functions don't use the comment extensions in any way. */ void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, const struct ip_set_ext *ext) { struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); size_t len = ext->comment ? strlen(ext->comment) : 0; if (unlikely(c)) { set->ext_size -= sizeof(*c) + strlen(c->str) + 1; kfree_rcu(c, rcu); rcu_assign_pointer(comment->c, NULL); } if (!len) return; if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) len = IPSET_MAX_COMMENT_SIZE; c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); if (unlikely(!c)) return; strlcpy(c->str, ext->comment, len + 1); set->ext_size += sizeof(*c) + strlen(c->str) + 1; rcu_assign_pointer(comment->c, c); } EXPORT_SYMBOL_GPL(ip_set_init_comment); /* Used only when dumping a set, protected by rcu_read_lock() */ static int ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) { struct ip_set_comment_rcu *c = rcu_dereference(comment->c); if (!c) return 0; return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); } /* Called from uadd/udel, flush or the garbage collectors protected * by the set spinlock. * Called when the set is destroyed and when there can't be any user * of the set data anymore. */ static void ip_set_comment_free(struct ip_set *set, void *ptr) { struct ip_set_comment *comment = ptr; struct ip_set_comment_rcu *c; c = rcu_dereference_protected(comment->c, 1); if (unlikely(!c)) return; set->ext_size -= sizeof(*c) + strlen(c->str) + 1; kfree_rcu(c, rcu); rcu_assign_pointer(comment->c, NULL); } typedef void (*destroyer)(struct ip_set *, void *); /* ipset data extension types, in size order */ const struct ip_set_ext_type ip_set_extensions[] = { [IPSET_EXT_ID_COUNTER] = { .type = IPSET_EXT_COUNTER, .flag = IPSET_FLAG_WITH_COUNTERS, .len = sizeof(struct ip_set_counter), .align = __alignof__(struct ip_set_counter), }, [IPSET_EXT_ID_TIMEOUT] = { .type = IPSET_EXT_TIMEOUT, .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, [IPSET_EXT_ID_SKBINFO] = { .type = IPSET_EXT_SKBINFO, .flag = IPSET_FLAG_WITH_SKBINFO, .len = sizeof(struct ip_set_skbinfo), .align = __alignof__(struct ip_set_skbinfo), }, [IPSET_EXT_ID_COMMENT] = { .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, .flag = IPSET_FLAG_WITH_COMMENT, .len = sizeof(struct ip_set_comment), .align = __alignof__(struct ip_set_comment), .destroy = ip_set_comment_free, }, }; EXPORT_SYMBOL_GPL(ip_set_extensions); static bool add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) { return ip_set_extensions[id].flag ? (flags & ip_set_extensions[id].flag) : !!tb[IPSET_ATTR_TIMEOUT]; } size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, size_t align) { enum ip_set_ext_id id; u32 cadt_flags = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) set->flags |= IPSET_CREATE_FLAG_FORCEADD; if (!align) align = 1; for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; if (align < ip_set_extensions[id].align) align = ip_set_extensions[id].align; len = ALIGN(len, ip_set_extensions[id].align); set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; len += ip_set_extensions[id].len; } return ALIGN(len, align); } EXPORT_SYMBOL_GPL(ip_set_elem_len); int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { u64 fullmark; if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_TIMEOUT]) { if (!SET_WITH_TIMEOUT(set)) return -IPSET_ERR_TIMEOUT; ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { if (!SET_WITH_COUNTER(set)) return -IPSET_ERR_COUNTER; if (tb[IPSET_ATTR_BYTES]) ext->bytes = be64_to_cpu(nla_get_be64( tb[IPSET_ATTR_BYTES])); if (tb[IPSET_ATTR_PACKETS]) ext->packets = be64_to_cpu(nla_get_be64( tb[IPSET_ATTR_PACKETS])); } if (tb[IPSET_ATTR_COMMENT]) { if (!SET_WITH_COMMENT(set)) return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } if (tb[IPSET_ATTR_SKBMARK]) { if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); ext->skbinfo.skbmark = fullmark >> 32; ext->skbinfo.skbmarkmask = fullmark & 0xffffffff; } if (tb[IPSET_ATTR_SKBPRIO]) { if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbinfo.skbprio = be32_to_cpu(nla_get_be32(tb[IPSET_ATTR_SKBPRIO])); } if (tb[IPSET_ATTR_SKBQUEUE]) { if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbinfo.skbqueue = be16_to_cpu(nla_get_be16(tb[IPSET_ATTR_SKBQUEUE])); } return 0; } EXPORT_SYMBOL_GPL(ip_set_get_extensions); static u64 ip_set_get_bytes(const struct ip_set_counter *counter) { return (u64)atomic64_read(&(counter)->bytes); } static u64 ip_set_get_packets(const struct ip_set_counter *counter) { return (u64)atomic64_read(&(counter)->packets); } static bool ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) { return nla_put_net64(skb, IPSET_ATTR_BYTES, cpu_to_be64(ip_set_get_bytes(counter)), IPSET_ATTR_PAD) || nla_put_net64(skb, IPSET_ATTR_PACKETS, cpu_to_be64(ip_set_get_packets(counter)), IPSET_ATTR_PAD); } static bool ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) { /* Send nonzero parameters only */ return ((skbinfo->skbmark || skbinfo->skbmarkmask) && nla_put_net64(skb, IPSET_ATTR_SKBMARK, cpu_to_be64((u64)skbinfo->skbmark << 32 | skbinfo->skbmarkmask), IPSET_ATTR_PAD)) || (skbinfo->skbprio && nla_put_net32(skb, IPSET_ATTR_SKBPRIO, cpu_to_be32(skbinfo->skbprio))) || (skbinfo->skbqueue && nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, cpu_to_be16(skbinfo->skbqueue))); } int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active) { if (SET_WITH_TIMEOUT(set)) { unsigned long *timeout = ext_timeout(e, set); if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(active ? ip_set_timeout_get(timeout) : *timeout))) return -EMSGSIZE; } if (SET_WITH_COUNTER(set) && ip_set_put_counter(skb, ext_counter(e, set))) return -EMSGSIZE; if (SET_WITH_COMMENT(set) && ip_set_put_comment(skb, ext_comment(e, set))) return -EMSGSIZE; if (SET_WITH_SKBINFO(set) && ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) return -EMSGSIZE; return 0; } EXPORT_SYMBOL_GPL(ip_set_put_extensions); static bool ip_set_match_counter(u64 counter, u64 match, u8 op) { switch (op) { case IPSET_COUNTER_NONE: return true; case IPSET_COUNTER_EQ: return counter == match; case IPSET_COUNTER_NE: return counter != match; case IPSET_COUNTER_LT: return counter < match; case IPSET_COUNTER_GT: return counter > match; } return false; } static void ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) { atomic64_add((long long)bytes, &(counter)->bytes); } static void ip_set_add_packets(u64 packets, struct ip_set_counter *counter) { atomic64_add((long long)packets, &(counter)->packets); } static void ip_set_update_counter(struct ip_set_counter *counter, const struct ip_set_ext *ext, u32 flags) { if (ext->packets != ULLONG_MAX && !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { ip_set_add_bytes(ext->bytes, counter); ip_set_add_packets(ext->packets, counter); } } static void ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { mext->skbinfo = *skbinfo; } bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set))) return false; if (SET_WITH_COUNTER(set)) { struct ip_set_counter *counter = ext_counter(data, set); ip_set_update_counter(counter, ext, flags); if (flags & IPSET_FLAG_MATCH_COUNTERS && !(ip_set_match_counter(ip_set_get_packets(counter), mext->packets, mext->packets_op) && ip_set_match_counter(ip_set_get_bytes(counter), mext->bytes, mext->bytes_op))) return false; } if (SET_WITH_SKBINFO(set)) ip_set_get_skbinfo(ext_skbinfo(data, set), ext, mext, flags); return true; } EXPORT_SYMBOL_GPL(ip_set_match_extensions); /* Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace * only and serialized by the nfnl mutex indirectly from nfnetlink. * * Sets are identified by their index in ip_set_list and the index * is used by the external references (set/SET netfilter modules). * * The set behind an index may change by swapping only, from userspace. */ static void __ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); set->ref++; write_unlock_bh(&ip_set_ref_lock); } static void __ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); BUG_ON(set->ref == 0); set->ref--; write_unlock_bh(&ip_set_ref_lock); } /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); BUG_ON(set->ref_netlink == 0); set->ref_netlink--; write_unlock_bh(&ip_set_ref_lock); } /* Add, del and test set entries from kernel. * * The set behind the index must exist and must be referenced * so it can't be destroyed (or changed) under our foot. */ static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); rcu_read_lock(); /* ip_set_list itself needs to be protected */ set = rcu_dereference(inst->ip_set_list)[index]; rcu_read_unlock(); return set; } static inline void ip_set_lock(struct ip_set *set) { if (!set->variant->region_lock) spin_lock_bh(&set->lock); } static inline void ip_set_unlock(struct ip_set *set) { if (!set->variant->region_lock) spin_unlock_bh(&set->lock); } int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(xt_net(par), index); int ret = 0; BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; rcu_read_lock_bh(); ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); rcu_read_unlock_bh(); if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); ip_set_lock(set); set->variant->kadt(set, skb, par, IPSET_ADD, opt); ip_set_unlock(set); ret = 1; } else { /* --return-nomatch: invert matched element */ if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) && (set->type->features & IPSET_TYPE_NOMATCH) && (ret > 0 || ret == -ENOTEMPTY)) ret = -ret; } /* Convert error codes to nomatch */ return (ret < 0 ? 0 : ret); } EXPORT_SYMBOL_GPL(ip_set_test); int ip_set_add(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(xt_net(par), index); int ret; BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); ip_set_unlock(set); return ret; } EXPORT_SYMBOL_GPL(ip_set_add); int ip_set_del(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(xt_net(par), index); int ret = 0; BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); ip_set_unlock(set); return ret; } EXPORT_SYMBOL_GPL(ip_set_del); /* Find set by name, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * */ ip_set_id_t ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) { ip_set_id_t i, index = IPSET_INVALID_ID; struct ip_set *s; struct ip_set_net *inst = ip_set_pernet(net); rcu_read_lock(); for (i = 0; i < inst->ip_set_max; i++) { s = rcu_dereference(inst->ip_set_list)[i]; if (s && STRNCMP(s->name, name)) { __ip_set_get(s); index = i; *set = s; break; } } rcu_read_unlock(); return index; } EXPORT_SYMBOL_GPL(ip_set_get_byname); /* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * */ static void __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) { struct ip_set *set; rcu_read_lock(); set = rcu_dereference(inst->ip_set_list)[index]; if (set) __ip_set_put(set); rcu_read_unlock(); } void ip_set_put_byindex(struct net *net, ip_set_id_t index) { struct ip_set_net *inst = ip_set_pernet(net); __ip_set_put_byindex(inst, index); } EXPORT_SYMBOL_GPL(ip_set_put_byindex); /* Get the name of a set behind a set index. * Set itself is protected by RCU, but its name isn't: to protect against * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the * name. */ void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) { struct ip_set *set = ip_set_rcu_get(net, index); BUG_ON(!set); read_lock_bh(&ip_set_ref_lock); strncpy(name, set->name, IPSET_MAXNAMELEN); read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); /* Routines to call by external subsystems, which do not * call nfnl_lock for us. */ /* Find set by index, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * * The nfnl mutex is used in the function. */ ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) { struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); if (index >= inst->ip_set_max) return IPSET_INVALID_ID; nfnl_lock(NFNL_SUBSYS_IPSET); set = ip_set(inst, index); if (set) __ip_set_get(set); else index = IPSET_INVALID_ID; nfnl_unlock(NFNL_SUBSYS_IPSET); return index; } EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); /* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * * The nfnl mutex is used in the function. */ void ip_set_nfnl_put(struct net *net, ip_set_id_t index) { struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); nfnl_lock(NFNL_SUBSYS_IPSET); if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ set = ip_set(inst, index); if (set) __ip_set_put(set); } nfnl_unlock(NFNL_SUBSYS_IPSET); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); /* Communication protocol with userspace over netlink. * * The commands are serialized by the nfnl mutex. */ static inline u8 protocol(const struct nlattr * const tb[]) { return nla_get_u8(tb[IPSET_ATTR_PROTOCOL]); } static inline bool protocol_failed(const struct nlattr * const tb[]) { return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) != IPSET_PROTOCOL; } static inline bool protocol_min_failed(const struct nlattr * const tb[]) { return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) < IPSET_PROTOCOL_MIN; } static inline u32 flag_exist(const struct nlmsghdr *nlh) { return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST; } static struct nlmsghdr * start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, enum ipset_cmd cmd) { return nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags, NFPROTO_IPV4, NFNETLINK_V0, 0); } /* Create a set */ static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1}, [IPSET_ATTR_REVISION] = { .type = NLA_U8 }, [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, }; static struct ip_set * find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) { struct ip_set *set = NULL; ip_set_id_t i; *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); if (set && STRNCMP(set->name, name)) { *id = i; break; } } return (*id == IPSET_INVALID_ID ? NULL : set); } static inline struct ip_set * find_set(struct ip_set_net *inst, const char *name) { ip_set_id_t id; return find_set_and_id(inst, name, &id); } static int find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, struct ip_set **set) { struct ip_set *s; ip_set_id_t i; *index = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (!s) { if (*index == IPSET_INVALID_ID) *index = i; } else if (STRNCMP(name, s->name)) { /* Name clash */ *set = s; return -EEXIST; } } if (*index == IPSET_INVALID_ID) /* No free slot remained */ return -IPSET_ERR_MAX_SETS; return 0; } static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { return -EOPNOTSUPP; } static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; u32 flags = flag_exist(info->nlh); int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_TYPENAME] || !attr[IPSET_ATTR_REVISION] || !attr[IPSET_ATTR_FAMILY] || (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])))) return -IPSET_ERR_PROTOCOL; name = nla_data(attr[IPSET_ATTR_SETNAME]); typename = nla_data(attr[IPSET_ATTR_TYPENAME]); family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); revision = nla_get_u8(attr[IPSET_ATTR_REVISION]); pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", name, typename, family_name(family), revision); /* First, and without any locks, allocate and initialize * a normal base set structure. */ set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; spin_lock_init(&set->lock); strlcpy(set->name, name, IPSET_MAXNAMELEN); set->family = family; set->revision = revision; /* Next, check that we know the type, and take * a reference on the type, to make sure it stays available * while constructing our new set. * * After referencing the type, we try to create the type * specific part of the set without holding any locks. */ ret = find_set_type_get(typename, family, revision, &set->type); if (ret) goto out; /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy, NULL)) { ret = -IPSET_ERR_PROTOCOL; goto put_out; } /* Set create flags depending on the type revision */ set->flags |= set->type->create_flags[revision]; ret = set->type->create(info->net, set, tb, flags); if (ret != 0) goto put_out; /* BTW, ret==0 here. */ /* Here, we have a valid, constructed set and we are protected * by the nfnl mutex. Find the first free index in ip_set_list * and check clashing. */ ret = find_free_id(inst, set->name, &index, &clash); if (ret == -EEXIST) { /* If this is the same set and requested, ignore error */ if ((flags & IPSET_FLAG_EXIST) && STRNCMP(set->type->name, clash->type->name) && set->type->family == clash->type->family && set->type->revision_min == clash->type->revision_min && set->type->revision_max == clash->type->revision_max && set->variant->same_set(set, clash)) ret = 0; goto cleanup; } else if (ret == -IPSET_ERR_MAX_SETS) { struct ip_set **list, **tmp; ip_set_id_t i = inst->ip_set_max + IP_SET_INC; if (i < inst->ip_set_max || i == IPSET_INVALID_ID) /* Wraparound */ goto cleanup; list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ tmp = ip_set_dereference(inst->ip_set_list); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ synchronize_net(); /* Use new list */ index = inst->ip_set_max; inst->ip_set_max = i; kvfree(tmp); ret = 0; } else if (ret) { goto cleanup; } /* Finally! Add our shiny new set to the list, and be done. */ pr_debug("create: '%s' created with index %u!\n", set->name, index); ip_set(inst, index) = set; return ret; cleanup: set->variant->destroy(set); put_out: module_put(set->type->me); out: kfree(set); return ret; } /* Destroy sets */ static const struct nla_policy ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, }; static void ip_set_destroy_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); /* Must call it without holding any lock */ set->variant->destroy(set); module_put(set->type->me); kfree(set); } static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; int ret = 0; if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; /* Must wait for flush to be really finished in list:set */ rcu_barrier(); /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call * ip_set_put|get_nfnl_* functions, that way we * can safely check references here. * * list:set timer can only decrement the reference * counter, so if it's already zero, we can proceed * without holding the lock. */ read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && (s->ref || s->ref_netlink)) { ret = -IPSET_ERR_BUSY; goto out; } } inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s) { ip_set(inst, i) = NULL; ip_set_destroy_set(s); } } /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { u32 flags = flag_exist(info->nlh); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { if (!(flags & IPSET_FLAG_EXIST)) ret = -ENOENT; goto out; } else if (s->ref || s->ref_netlink) { ret = -IPSET_ERR_BUSY; goto out; } ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); ip_set_destroy_set(s); } return 0; out: read_unlock_bh(&ip_set_ref_lock); return ret; } /* Flush sets */ static void ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); ip_set_lock(set); set->variant->flush(set); ip_set_unlock(set); } static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s) ip_set_flush_set(s); } } else { s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!s) return -ENOENT; ip_set_flush_set(s); } return 0; } /* Rename a set */ static const struct nla_policy ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, }; static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *s; const char *name2; ip_set_id_t i; int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; write_lock_bh(&ip_set_ref_lock); if (set->ref != 0 || set->ref_netlink != 0) { ret = -IPSET_ERR_REFERENCED; goto out; } name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && STRNCMP(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } } strncpy(set->name, name2, IPSET_MAXNAMELEN); out: write_unlock_bh(&ip_set_ref_lock); return ret; } /* Swap two sets so that name/index points to the other. * References and set names are also swapped. * * The commands are serialized by the nfnl mutex and references are * protected by the ip_set_ref_lock. The kernel interfaces * do not hold the mutex but the pointer settings are atomic * so the ip_set_list always contains valid pointers to the sets. */ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *from, *to; ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &from_id); if (!from) return -ENOENT; to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id); if (!to) return -IPSET_ERR_EXIST_SETNAME2; /* Features must not change. * Not an artifical restriction anymore, as we must prevent * possible loops created by swapping in setlist type of sets. */ if (!(from->type->features == to->type->features && from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; write_lock_bh(&ip_set_ref_lock); if (from->ref_netlink || to->ref_netlink) { write_unlock_bh(&ip_set_ref_lock); return -EBUSY; } strncpy(from_name, from->name, IPSET_MAXNAMELEN); strncpy(from->name, to->name, IPSET_MAXNAMELEN); strncpy(to->name, from_name, IPSET_MAXNAMELEN); swap(from->ref, to->ref); ip_set(inst, from_id) = to; ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); return 0; } /* List/save set data */ #define DUMP_INIT 0 #define DUMP_ALL 1 #define DUMP_ONE 2 #define DUMP_LAST 3 #define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) #define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) int ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) { u32 cadt_flags = 0; if (SET_WITH_TIMEOUT(set)) if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(set->timeout)))) return -EMSGSIZE; if (SET_WITH_COUNTER(set)) cadt_flags |= IPSET_FLAG_WITH_COUNTERS; if (SET_WITH_COMMENT(set)) cadt_flags |= IPSET_FLAG_WITH_COMMENT; if (SET_WITH_SKBINFO(set)) cadt_flags |= IPSET_FLAG_WITH_SKBINFO; if (SET_WITH_FORCEADD(set)) cadt_flags |= IPSET_FLAG_WITH_FORCEADD; if (!cadt_flags) return 0; return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); } EXPORT_SYMBOL_GPL(ip_set_put_flags); static int ip_set_dump_done(struct netlink_callback *cb) { if (cb->args[IPSET_CB_ARG0]) { struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; struct ip_set *set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); __ip_set_put_netlink(set); } return 0; } static inline void dump_attrs(struct nlmsghdr *nlh) { const struct nlattr *attr; int rem; pr_debug("dump nlmsg\n"); nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) { pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len); } } static const struct nla_policy ip_set_dump_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_FLAGS] = { .type = NLA_U32 }, }; static int ip_set_dump_start(struct netlink_callback *cb) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; struct sk_buff *skb = cb->skb; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type; int ret; ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, ip_set_dump_policy, NULL); if (ret) goto error; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { ip_set_id_t index; struct ip_set *set; set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); if (!set) { ret = -ENOENT; goto error; } dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; } else { dump_type = DUMP_ALL; } if (cda[IPSET_ATTR_FLAGS]) { u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); dump_type |= (f << 16); } cb->args[IPSET_CB_NET] = (unsigned long)inst; cb->args[IPSET_CB_DUMP] = dump_type; return 0; error: /* We have to create and send the error message manually :-( */ if (nlh->nlmsg_flags & NLM_F_ACK) { netlink_ack(cb->skb, nlh, ret, NULL); } return ret; } static int ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb) { ip_set_id_t index = IPSET_INVALID_ID, max; struct ip_set *set = NULL; struct nlmsghdr *nlh = NULL; unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type, dump_flags; bool is_destroyed; int ret = 0; if (!cb->args[IPSET_CB_DUMP]) return -EINVAL; if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max) goto out; dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]); dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]); max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1 : inst->ip_set_max; dump_last: pr_debug("dump type, flag: %u %u index: %ld\n", dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; write_lock_bh(&ip_set_ref_lock); set = ip_set(inst, index); is_destroyed = inst->is_destroyed; if (!set || is_destroyed) { write_unlock_bh(&ip_set_ref_lock); if (dump_type == DUMP_ONE) { ret = -ENOENT; goto out; } if (is_destroyed) { /* All sets are just being destroyed */ ret = 0; goto out; } continue; } /* When dumping all sets, we must dump "sorted" * so that lists (unions of sets) are dumped last. */ if (dump_type != DUMP_ONE && ((dump_type == DUMP_ALL) == !!(set->type->features & IPSET_DUMP_LAST))) { write_unlock_bh(&ip_set_ref_lock); continue; } pr_debug("List set: %s\n", set->name); if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); set->ref_netlink++; } write_unlock_bh(&ip_set_ref_lock); nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, IPSET_CMD_LIST); if (!nlh) { ret = -EMSGSIZE; goto release_refcount; } if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, cb->args[IPSET_CB_PROTO]) || nla_put_string(skb, IPSET_ATTR_SETNAME, set->name)) goto nla_put_failure; if (dump_flags & IPSET_FLAG_LIST_SETNAME) goto next_set; switch (cb->args[IPSET_CB_ARG0]) { case 0: /* Core header data */ if (nla_put_string(skb, IPSET_ATTR_TYPENAME, set->type->name) || nla_put_u8(skb, IPSET_ATTR_FAMILY, set->family) || nla_put_u8(skb, IPSET_ATTR_REVISION, set->revision)) goto nla_put_failure; if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN && nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index))) goto nla_put_failure; ret = set->variant->head(set, skb); if (ret < 0) goto release_refcount; if (dump_flags & IPSET_FLAG_LIST_HEADER) goto next_set; if (set->variant->uref) set->variant->uref(set, cb, true); fallthrough; default: ret = set->variant->list(set, skb, cb); if (!cb->args[IPSET_CB_ARG0]) /* Set is done, proceed with next one */ goto next_set; goto release_refcount; } } /* If we dump all sets, continue with dumping last ones */ if (dump_type == DUMP_ALL) { dump_type = DUMP_LAST; cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); cb->args[IPSET_CB_INDEX] = 0; if (set && set->variant->uref) set->variant->uref(set, cb, false); goto dump_last; } goto out; nla_put_failure: ret = -EFAULT; next_set: if (dump_type == DUMP_ONE) cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID; else cb->args[IPSET_CB_INDEX]++; release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); __ip_set_put_netlink(set); cb->args[IPSET_CB_ARG0] = 0; } out: if (nlh) { nlmsg_end(skb, nlh); pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len); dump_attrs(nlh); } return ret < 0 ? ret : skb->len; } static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; { struct netlink_dump_control c = { .start = ip_set_dump_start, .dump = ip_set_dump_do, .done = ip_set_dump_done, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } } /* Add, del and test */ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, [IPSET_ATTR_ADT] = { .type = NLA_NESTED }, }; static int call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 flags, bool use_lineno) { int ret; u32 lineno = 0; bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); retried = true; } while (ret == -ERANGE || (ret == -EAGAIN && set->variant->resize && (ret = set->variant->resize(set, retried)) == 0)); if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) return 0; if (lineno && use_lineno) { /* Error in restore/batch mode: send back lineno */ struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; struct nlmsgerr *errmsg; size_t payload = min(SIZE_MAX, sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *cmdattr; u32 *errline; skb2 = nlmsg_new(payload, GFP_KERNEL); if (!skb2) return -ENOMEM; rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = ret; memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, nlh->nlmsg_len - min_len, ip_set_adt_policy, NULL); if (ret) { nlmsg_free(skb2); return ret; } errline = nla_data(cda[IPSET_ATTR_LINENO]); *errline = lineno; nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } return ret; } static int ip_set_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, enum ipset_adt adt, const struct nlmsghdr *nlh, const struct nlattr * const attr[], struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; if (attr[IPSET_ATTR_DATA]) { if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; } } return ret; } static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { return ip_set_ad(info->net, info->sk, skb, IPSET_ADD, info->nlh, attr, info->extack); } static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { return ip_set_ad(info->net, info->sk, skb, IPSET_DEL, info->nlh, attr, info->extack); } static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; u32 lineno; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_DATA] || !flag_nested(attr[IPSET_ATTR_DATA]))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0); rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) ret = 1; return ret > 0 ? 0 : -IPSET_ERR_EXIST; } /* Get headed data of a set */ static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_HEADER); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) || nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision)) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } /* Get type data */ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, }; static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; u8 family, min, max; const char *typename; int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_TYPENAME] || !attr[IPSET_ATTR_FAMILY])) return -IPSET_ERR_PROTOCOL; family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); typename = nla_data(attr[IPSET_ATTR_TYPENAME]); ret = find_set_type_minmax(typename, family, &min, &max); if (ret) return ret; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_TYPE); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) || nla_put_u8(skb2, IPSET_ATTR_REVISION, max) || nla_put_u8(skb2, IPSET_ATTR_REVISION_MIN, min)) goto nla_put_failure; nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } /* Get protocol version */ static const struct nla_policy ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, }; static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_PROTOCOL); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL)) goto nla_put_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL_MIN, IPSET_PROTOCOL_MIN)) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } /* Get set by name or index, from userspace */ static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &id); if (id == IPSET_INVALID_ID) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYNAME); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || nla_put_net16(skb2, IPSET_ATTR_INDEX, htons(id))) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_INDEX] = { .type = NLA_U16 }, }; static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_INDEX])) return -IPSET_ERR_PROTOCOL; id = ip_set_get_h16(attr[IPSET_ATTR_INDEX]); if (id >= inst->ip_set_max) return -ENOENT; set = ip_set(inst, id); if (set == NULL) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYINDEX); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name)) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_NONE] = { .call = ip_set_none, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, }, [IPSET_CMD_CREATE] = { .call = ip_set_create, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_create_policy, }, [IPSET_CMD_DESTROY] = { .call = ip_set_destroy, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_FLUSH] = { .call = ip_set_flush, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_RENAME] = { .call = ip_set_rename, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_SWAP] = { .call = ip_set_swap, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_LIST] = { .call = ip_set_dump, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_ADD] = { .call = ip_set_uadd, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_DEL] = { .call = ip_set_udel, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_TEST] = { .call = ip_set_utest, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_HEADER] = { .call = ip_set_header, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_TYPE] = { .call = ip_set_type, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_type_policy, }, [IPSET_CMD_PROTOCOL] = { .call = ip_set_protocol, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_protocol_policy, }, [IPSET_CMD_GET_BYNAME] = { .call = ip_set_byname, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_GET_BYINDEX] = { .call = ip_set_byindex, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_index_policy, }, }; static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = { .name = "ip_set", .subsys_id = NFNL_SUBSYS_IPSET, .cb_count = IPSET_MSG_MAX, .cb = ip_set_netlink_subsys_cb, }; /* Interface to iptables/ip6tables */ static int ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) { unsigned int *op; void *data; int copylen = *len, ret = 0; struct net *net = sock_net(sk); struct ip_set_net *inst = ip_set_pernet(net); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (optval != SO_IP_SET) return -EBADF; if (*len < sizeof(unsigned int)) return -EINVAL; data = vmalloc(*len); if (!data) return -ENOMEM; if (copy_from_user(data, user, *len) != 0) { ret = -EFAULT; goto done; } op = data; if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ struct ip_set_req_version *req_version = data; if (*len < sizeof(struct ip_set_req_version)) { ret = -EINVAL; goto done; } if (req_version->version < IPSET_PROTOCOL_MIN) { ret = -EPROTO; goto done; } } switch (*op) { case IP_SET_OP_VERSION: { struct ip_set_req_version *req_version = data; if (*len != sizeof(struct ip_set_req_version)) { ret = -EINVAL; goto done; } req_version->version = IPSET_PROTOCOL; if (copy_to_user(user, req_version, sizeof(struct ip_set_req_version))) ret = -EFAULT; goto done; } case IP_SET_OP_GET_BYNAME: { struct ip_set_req_get_set *req_get = data; ip_set_id_t id; if (*len != sizeof(struct ip_set_req_get_set)) { ret = -EINVAL; goto done; } req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; nfnl_lock(NFNL_SUBSYS_IPSET); find_set_and_id(inst, req_get->set.name, &id); req_get->set.index = id; nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } case IP_SET_OP_GET_FNAME: { struct ip_set_req_get_set_family *req_get = data; ip_set_id_t id; if (*len != sizeof(struct ip_set_req_get_set_family)) { ret = -EINVAL; goto done; } req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; nfnl_lock(NFNL_SUBSYS_IPSET); find_set_and_id(inst, req_get->set.name, &id); req_get->set.index = id; if (id != IPSET_INVALID_ID) req_get->family = ip_set(inst, id)->family; nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } case IP_SET_OP_GET_BYINDEX: { struct ip_set_req_get_set *req_get = data; struct ip_set *set; if (*len != sizeof(struct ip_set_req_get_set) || req_get->set.index >= inst->ip_set_max) { ret = -EINVAL; goto done; } nfnl_lock(NFNL_SUBSYS_IPSET); set = ip_set(inst, req_get->set.index); ret = strscpy(req_get->set.name, set ? set->name : "", IPSET_MAXNAMELEN); nfnl_unlock(NFNL_SUBSYS_IPSET); if (ret < 0) goto done; goto copy; } default: ret = -EBADMSG; goto done; } /* end of switch(op) */ copy: if (copy_to_user(user, data, copylen)) ret = -EFAULT; done: vfree(data); if (ret > 0) ret = 0; return ret; } static struct nf_sockopt_ops so_set __read_mostly = { .pf = PF_INET, .get_optmin = SO_IP_SET, .get_optmax = SO_IP_SET + 1, .get = ip_set_sockfn_get, .owner = THIS_MODULE, }; static int __net_init ip_set_net_init(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set **list; inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX; if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; inst->is_deleted = false; inst->is_destroyed = false; rcu_assign_pointer(inst->ip_set_list, list); return 0; } static void __net_exit ip_set_net_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set = NULL; ip_set_id_t i; inst->is_deleted = true; /* flag for ip_set_nfnl_put */ nfnl_lock(NFNL_SUBSYS_IPSET); for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); if (set) { ip_set(inst, i) = NULL; ip_set_destroy_set(set); } } nfnl_unlock(NFNL_SUBSYS_IPSET); kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), }; static int __init ip_set_init(void) { int ret = register_pernet_subsys(&ip_set_net_ops); if (ret) { pr_err("ip_set: cannot register pernet_subsys.\n"); return ret; } ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); unregister_pernet_subsys(&ip_set_net_ops); return ret; } ret = nf_register_sockopt(&so_set); if (ret != 0) { pr_err("SO_SET registry failed: %d\n", ret); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); unregister_pernet_subsys(&ip_set_net_ops); return ret; } return 0; } static void __exit ip_set_fini(void) { nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); unregister_pernet_subsys(&ip_set_net_ops); pr_debug("these are the famous last words\n"); } module_init(ip_set_init); module_exit(ip_set_fini); MODULE_DESCRIPTION("ip_set: protocol " __stringify(IPSET_PROTOCOL));
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #define _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); void batadv_softif_vlan_release(struct kref *ref); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); /** * batadv_softif_vlan_put() - decrease the vlan object refcounter and * possibly release it * @vlan: the vlan object to release */ static inline void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) { if (!vlan) return; kref_put(&vlan->refcount, batadv_softif_vlan_release); } #endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */
1084 1085 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007, 2008, 2009 Siemens AG */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/device.h> #include <net/cfg802154.h> #include <net/rtnetlink.h> #include "ieee802154.h" #include "nl802154.h" #include "sysfs.h" #include "core.h" /* name for sysfs, %d is appended */ #define PHY_NAME "phy" /* RCU-protected (and RTNL for writers) */ LIST_HEAD(cfg802154_rdev_list); int cfg802154_rdev_list_generation; struct wpan_phy *wpan_phy_find(const char *str) { struct device *dev; if (WARN_ON(!str)) return NULL; dev = class_find_device_by_name(&wpan_phy_class, str); if (!dev) return NULL; return container_of(dev, struct wpan_phy, dev); } EXPORT_SYMBOL(wpan_phy_find); struct wpan_phy_iter_data { int (*fn)(struct wpan_phy *phy, void *data); void *data; }; static int wpan_phy_iter(struct device *dev, void *_data) { struct wpan_phy_iter_data *wpid = _data; struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); return wpid->fn(phy, wpid->data); } int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data), void *data) { struct wpan_phy_iter_data wpid = { .fn = fn, .data = data, }; return class_for_each_device(&wpan_phy_class, NULL, &wpid, wpan_phy_iter); } EXPORT_SYMBOL(wpan_phy_for_each); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx) { struct cfg802154_registered_device *result = NULL, *rdev; ASSERT_RTNL(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (rdev->wpan_phy_idx == wpan_phy_idx) { result = rdev; break; } } return result; } struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx) { struct cfg802154_registered_device *rdev; ASSERT_RTNL(); rdev = cfg802154_rdev_by_wpan_phy_idx(wpan_phy_idx); if (!rdev) return NULL; return &rdev->wpan_phy; } struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) { static atomic_t wpan_phy_counter = ATOMIC_INIT(0); struct cfg802154_registered_device *rdev; size_t alloc_size; alloc_size = sizeof(*rdev) + priv_size; rdev = kzalloc(alloc_size, GFP_KERNEL); if (!rdev) return NULL; rdev->ops = ops; rdev->wpan_phy_idx = atomic_inc_return(&wpan_phy_counter); if (unlikely(rdev->wpan_phy_idx < 0)) { /* ugh, wrapped! */ atomic_dec(&wpan_phy_counter); kfree(rdev); return NULL; } /* atomic_inc_return makes it start at 1, make it start at 0 */ rdev->wpan_phy_idx--; INIT_LIST_HEAD(&rdev->wpan_dev_list); device_initialize(&rdev->wpan_phy.dev); dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx); rdev->wpan_phy.dev.class = &wpan_phy_class; rdev->wpan_phy.dev.platform_data = rdev; wpan_phy_net_set(&rdev->wpan_phy, &init_net); init_waitqueue_head(&rdev->dev_wait); return &rdev->wpan_phy; } EXPORT_SYMBOL(wpan_phy_new); int wpan_phy_register(struct wpan_phy *phy) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy); int ret; rtnl_lock(); ret = device_add(&phy->dev); if (ret) { rtnl_unlock(); return ret; } list_add_rcu(&rdev->list, &cfg802154_rdev_list); cfg802154_rdev_list_generation++; /* TODO phy registered lock */ rtnl_unlock(); /* TODO nl802154 phy notify */ return 0; } EXPORT_SYMBOL(wpan_phy_register); void wpan_phy_unregister(struct wpan_phy *phy) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy); wait_event(rdev->dev_wait, ({ int __count; rtnl_lock(); __count = rdev->opencount; rtnl_unlock(); __count == 0; })); rtnl_lock(); /* TODO nl802154 phy notify */ /* TODO phy registered lock */ WARN_ON(!list_empty(&rdev->wpan_dev_list)); /* First remove the hardware from everywhere, this makes * it impossible to find from userspace. */ list_del_rcu(&rdev->list); synchronize_rcu(); cfg802154_rdev_list_generation++; device_del(&phy->dev); rtnl_unlock(); } EXPORT_SYMBOL(wpan_phy_unregister); void wpan_phy_free(struct wpan_phy *phy) { put_device(&phy->dev); } EXPORT_SYMBOL(wpan_phy_free); int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, struct net *net) { struct wpan_dev *wpan_dev; int err = 0; list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; } if (err) { /* failed -- clean up to old netns */ net = wpan_phy_net(&rdev->wpan_phy); list_for_each_entry_continue_reverse(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); WARN_ON(err); wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; } return err; } wpan_phy_net_set(&rdev->wpan_phy, net); err = device_rename(&rdev->wpan_phy.dev, dev_name(&rdev->wpan_phy.dev)); WARN_ON(err); return 0; } void cfg802154_dev_free(struct cfg802154_registered_device *rdev) { kfree(rdev); } static void cfg802154_update_iface_num(struct cfg802154_registered_device *rdev, int iftype, int num) { ASSERT_RTNL(); rdev->num_running_ifaces += num; } static int cfg802154_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct cfg802154_registered_device *rdev; if (!wpan_dev) return NOTIFY_DONE; rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy); /* TODO WARN_ON unspec type */ switch (state) { /* TODO NETDEV_DEVTYPE */ case NETDEV_REGISTER: dev->features |= NETIF_F_NETNS_LOCAL; wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; wpan_dev->netdev = dev; break; case NETDEV_DOWN: cfg802154_update_iface_num(rdev, wpan_dev->iftype, -1); rdev->opencount--; wake_up(&rdev->dev_wait); break; case NETDEV_UP: cfg802154_update_iface_num(rdev, wpan_dev->iftype, 1); rdev->opencount++; break; case NETDEV_UNREGISTER: /* It is possible to get NETDEV_UNREGISTER * multiple times. To detect that, check * that the interface is still on the list * of registered interfaces, and only then * remove and clean it up. */ if (!list_empty(&wpan_dev->list)) { list_del_rcu(&wpan_dev->list); rdev->devlist_generation++; } /* synchronize (so that we won't find this netdev * from other code any more) and then clear the list * head so that the above code can safely check for * !list_empty() to avoid double-cleanup. */ synchronize_rcu(); INIT_LIST_HEAD(&wpan_dev->list); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block cfg802154_netdev_notifier = { .notifier_call = cfg802154_netdev_notifier_call, }; static void __net_exit cfg802154_pernet_exit(struct net *net) { struct cfg802154_registered_device *rdev; rtnl_lock(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (net_eq(wpan_phy_net(&rdev->wpan_phy), net)) WARN_ON(cfg802154_switch_netns(rdev, &init_net)); } rtnl_unlock(); } static struct pernet_operations cfg802154_pernet_ops = { .exit = cfg802154_pernet_exit, }; static int __init wpan_phy_class_init(void) { int rc; rc = register_pernet_device(&cfg802154_pernet_ops); if (rc) goto err; rc = wpan_phy_sysfs_init(); if (rc) goto err_sysfs; rc = register_netdevice_notifier(&cfg802154_netdev_notifier); if (rc) goto err_nl; rc = ieee802154_nl_init(); if (rc) goto err_notifier; rc = nl802154_init(); if (rc) goto err_ieee802154_nl; return 0; err_ieee802154_nl: ieee802154_nl_exit(); err_notifier: unregister_netdevice_notifier(&cfg802154_netdev_notifier); err_nl: wpan_phy_sysfs_exit(); err_sysfs: unregister_pernet_device(&cfg802154_pernet_ops); err: return rc; } subsys_initcall(wpan_phy_class_init); static void __exit wpan_phy_class_exit(void) { nl802154_exit(); ieee802154_nl_exit(); unregister_netdevice_notifier(&cfg802154_netdev_notifier); wpan_phy_sysfs_exit(); unregister_pernet_device(&cfg802154_pernet_ops); } module_exit(wpan_phy_class_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface"); MODULE_AUTHOR("Dmitry Eremin-Solenikov");
1090 1090 1090 1075 1090 1090 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/block_validity.c * * Copyright (C) 2009 * Theodore Ts'o (tytso@mit.edu) * * Track which blocks in the filesystem are metadata blocks that * should never be used as data blocks by files or directories. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/slab.h> #include "ext4.h" struct ext4_system_zone { struct rb_node node; ext4_fsblk_t start_blk; unsigned int count; u32 ino; }; static struct kmem_cache *ext4_system_zone_cachep; int __init ext4_init_system_zone(void) { ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); if (ext4_system_zone_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_system_zone(void) { rcu_barrier(); kmem_cache_destroy(ext4_system_zone_cachep); } static inline int can_merge(struct ext4_system_zone *entry1, struct ext4_system_zone *entry2) { if ((entry1->start_blk + entry1->count) == entry2->start_blk && entry1->ino == entry2->ino) return 1; return 0; } static void release_system_zone(struct ext4_system_blocks *system_blks) { struct ext4_system_zone *entry, *n; rbtree_postorder_for_each_entry_safe(entry, n, &system_blks->root, node) kmem_cache_free(ext4_system_zone_cachep, entry); } /* * Mark a range of blocks as belonging to the "system zone" --- that * is, filesystem metadata blocks which should never be used by * inodes. */ static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, unsigned int count, u32 ino) { struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_system_zone, node); if (start_blk < entry->start_blk) n = &(*n)->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else /* Unexpected overlap of system zones. */ return -EFSCORRUPTED; } new_entry = kmem_cache_alloc(ext4_system_zone_cachep, GFP_KERNEL); if (!new_entry) return -ENOMEM; new_entry->start_blk = start_blk; new_entry->count = count; new_entry->ino = ino; new_node = &new_entry->node; rb_link_node(new_node, parent, n); rb_insert_color(new_node, &system_blks->root); /* Can we merge to the left? */ node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(entry, new_entry)) { new_entry->start_blk = entry->start_blk; new_entry->count += entry->count; rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } /* Can we merge to the right? */ node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(new_entry, entry)) { new_entry->count += entry->count; rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } return 0; } static void debug_print_tree(struct ext4_sb_info *sbi) { struct rb_node *node; struct ext4_system_zone *entry; struct ext4_system_blocks *system_blks; int first = 1; printk(KERN_INFO "System zones: "); rcu_read_lock(); system_blks = rcu_dereference(sbi->s_system_blks); node = rb_first(&system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", entry->start_blk, entry->start_blk + entry->count - 1); first = 0; node = rb_next(node); } rcu_read_unlock(); printk(KERN_CONT "\n"); } static int ext4_protect_reserved_inode(struct super_block *sb, struct ext4_system_blocks *system_blks, u32 ino) { struct inode *inode; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_map_blocks map; u32 i = 0, num; int err = 0, n; if ((ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(sbi->s_es->s_inodes_count))) return -EINVAL; inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) return PTR_ERR(inode); num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; while (i < num) { cond_resched(); map.m_lblk = i; map.m_len = num - i; n = ext4_map_blocks(NULL, inode, &map, 0); if (n < 0) { err = n; break; } if (n == 0) { i++; } else { err = add_system_zone(system_blks, map.m_pblk, n, ino); if (err < 0) { if (err == -EFSCORRUPTED) { EXT4_ERROR_INODE_ERR(inode, -err, "blocks %llu-%llu from inode overlap system zone", map.m_pblk, map.m_pblk + map.m_len - 1); } break; } i += n; } } iput(inode); return err; } static void ext4_destroy_system_zone(struct rcu_head *rcu) { struct ext4_system_blocks *system_blks; system_blks = container_of(rcu, struct ext4_system_blocks, rcu); release_system_zone(system_blks); kfree(system_blks); } /* * Build system zone rbtree which is used for block validity checking. * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. That's why we first build the rbtree and then * swap it in place. */ int ext4_setup_system_zone(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; int flex_size = ext4_flex_bg_size(sbi); int ret; system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); if (!system_blks) return -ENOMEM; for (i=0; i < ngroups; i++) { cond_resched(); if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) { ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), ext4_bg_num_gdb(sb, i) + 1, 0); if (ret) goto err; } gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(system_blks, ext4_block_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_table(sb, gdp), sbi->s_itb_per_group, 0); if (ret) goto err; } if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { ret = ext4_protect_reserved_inode(sb, system_blks, le32_to_cpu(sbi->s_es->s_journal_inum)); if (ret) goto err; } /* * System blks rbtree complete, announce it once to prevent racing * with ext4_inode_block_valid() accessing the rbtree at the same * time. */ rcu_assign_pointer(sbi->s_system_blks, system_blks); if (test_opt(sb, DEBUG)) debug_print_tree(sbi); return 0; err: release_system_zone(system_blks); kfree(system_blks); return ret; } /* * Called when the filesystem is unmounted or when remounting it with * noblock_validity specified. * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. So we first clear the system_blks pointer and * then free the rbtree only after RCU grace period expires. */ void ext4_release_system_zone(struct super_block *sb) { struct ext4_system_blocks *system_blks; system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks, lockdep_is_held(&sb->s_umount)); rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL); if (system_blks) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_system_blocks *system_blks; struct ext4_system_zone *entry; struct rb_node *n; int ret = 1; if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (start_blk + count < start_blk) || (start_blk + count > ext4_blocks_count(sbi->s_es))) return 0; /* * Lock the system zone to prevent it being released concurrently * when doing a remount which inverse current "[no]block_validity" * mount option. */ rcu_read_lock(); system_blks = rcu_dereference(sbi->s_system_blks); if (system_blks == NULL) goto out_rcu; n = system_blks->root.rb_node; while (n) { entry = rb_entry(n, struct ext4_system_zone, node); if (start_blk + count - 1 < entry->start_blk) n = n->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; else { ret = 0; if (inode) ret = (entry->ino == inode->i_ino); break; } } out_rcu: rcu_read_unlock(); return ret; } /* * Returns 1 if the passed-in block region (start_blk, * start_blk+count) is valid; 0 if some part of the block region * overlaps with some other filesystem metadata blocks. */ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count); } int ext4_check_blockref(const char *function, unsigned int line, struct inode *inode, __le32 *p, unsigned int max) { __le32 *bref = p; unsigned int blk; if (ext4_has_feature_journal(inode->i_sb) && (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; while (bref < p+max) { blk = le32_to_cpu(*bref++); if (blk && unlikely(!ext4_inode_block_valid(inode, blk, 1))) { ext4_error_inode(inode, function, line, blk, "invalid block"); return -EFSCORRUPTED; } } return 0; }
564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/ip.h> #include <linux/sctp.h> #include <net/ip.h> #include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/sctp/checksum.h> #include <net/ip_vs.h> static int sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); static int sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct sctp_chunkhdr _schunkh, *sch; struct sctphdr *sh, _sctph; __be16 _ports[2], *ports = NULL; if (likely(!ip_vs_iph_icmp(iph))) { sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); if (sh) { sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); if (sch) { if (sch->type == SCTP_CID_ABORT || !(sysctl_sloppy_sctp(ipvs) || sch->type == SCTP_CID_INIT)) return 1; ports = &sh->source; } } } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static void sctp_nat_csum(struct sk_buff *skb, struct sctphdr *sctph, unsigned int sctphoff) { sctph->checksum = sctp_compute_cksum(skb, sctphoff); skb->ip_summed = CHECKSUM_UNNECESSARY; } static int sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct sctphdr *sctph; unsigned int sctphoff = iph->len; bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!sctp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ ret = ip_vs_app_pkt_out(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 2) payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; /* Only update csum if we really have to */ if (sctph->source != cp->vport || payload_csum || skb->ip_summed == CHECKSUM_PARTIAL) { sctph->source = cp->vport; sctp_nat_csum(skb, sctph, sctphoff); } else { skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct sctphdr *sctph; unsigned int sctphoff = iph->len; bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!sctp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ ret = ip_vs_app_pkt_in(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 2) payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; /* Only update csum if we really have to */ if (sctph->dest != cp->dport || payload_csum || (skb->ip_summed == CHECKSUM_PARTIAL && !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { unsigned int sctphoff; struct sctphdr *sh; __le32 cmp, val; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) sctphoff = sizeof(struct ipv6hdr); else #endif sctphoff = ip_hdrlen(skb); sh = (struct sctphdr *)(skb->data + sctphoff); cmp = sh->checksum; val = sctp_compute_cksum(skb, sctphoff); if (val != cmp) { /* CRC failure, dump it. */ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } return 1; } enum ipvs_sctp_event_t { IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */ IP_VS_SCTP_INIT, IP_VS_SCTP_INIT_ACK, IP_VS_SCTP_COOKIE_ECHO, IP_VS_SCTP_COOKIE_ACK, IP_VS_SCTP_SHUTDOWN, IP_VS_SCTP_SHUTDOWN_ACK, IP_VS_SCTP_SHUTDOWN_COMPLETE, IP_VS_SCTP_ERROR, IP_VS_SCTP_ABORT, IP_VS_SCTP_EVENT_LAST }; /* RFC 2960, 3.2 Chunk Field Descriptions */ static __u8 sctp_events[] = { [SCTP_CID_DATA] = IP_VS_SCTP_DATA, [SCTP_CID_INIT] = IP_VS_SCTP_INIT, [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK, [SCTP_CID_SACK] = IP_VS_SCTP_DATA, [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA, [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA, [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT, [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN, [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK, [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR, [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO, [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK, [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA, [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA, [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE, }; /* SCTP States: * See RFC 2960, 4. SCTP Association State Diagram * * New states (not in diagram): * - INIT1 state: use shorter timeout for dropped INIT packets * - REJECTED state: use shorter timeout if INIT is rejected with ABORT * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging * * The states are as seen in real server. In the diagram, INIT1, INIT, * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. * * States as per packets from client (C) and server (S): * * Setup of client connection: * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK * * Setup of server connection: * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK */ #define sNO IP_VS_SCTP_S_NONE #define sI1 IP_VS_SCTP_S_INIT1 #define sIN IP_VS_SCTP_S_INIT #define sCS IP_VS_SCTP_S_COOKIE_SENT #define sCR IP_VS_SCTP_S_COOKIE_REPLIED #define sCW IP_VS_SCTP_S_COOKIE_WAIT #define sCO IP_VS_SCTP_S_COOKIE #define sCE IP_VS_SCTP_S_COOKIE_ECHOED #define sES IP_VS_SCTP_S_ESTABLISHED #define sSS IP_VS_SCTP_S_SHUTDOWN_SENT #define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED #define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT #define sRJ IP_VS_SCTP_S_REJECTED #define sCL IP_VS_SCTP_S_CLOSED static const __u8 sctp_states [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = { { /* INPUT */ /* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ /* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, /* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL}, /* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, /* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL}, /* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL}, /* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL}, /* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, }, { /* OUTPUT */ /* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ /* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW}, /* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, /* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL}, /* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL}, /* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, }, { /* INPUT-ONLY */ /* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ /* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, /* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, /* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, /* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL}, /* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL}, /* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, }, }; #define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ) /* Timeout table[state] */ static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { [IP_VS_SCTP_S_NONE] = 2 * HZ, [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ, [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ, [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_LAST] = 2 * HZ, }; static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { [IP_VS_SCTP_S_NONE] = "NONE", [IP_VS_SCTP_S_INIT1] = "INIT1", [IP_VS_SCTP_S_INIT] = "INIT", [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT", [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED", [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT", [IP_VS_SCTP_S_COOKIE] = "COOKIE", [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED", [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED", [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT", [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED", [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT", [IP_VS_SCTP_S_REJECTED] = "REJECTED", [IP_VS_SCTP_S_CLOSED] = "CLOSED", [IP_VS_SCTP_S_LAST] = "BUG!", }; static const char *sctp_state_name(int state) { if (state >= IP_VS_SCTP_S_LAST) return "ERR!"; if (sctp_state_name_table[state]) return sctp_state_name_table[state]; return "?"; } static inline void set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, const struct sk_buff *skb) { struct sctp_chunkhdr _sctpch, *sch; unsigned char chunk_type; int event, next_state; int ihl, cofs; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); #else ihl = ip_hdrlen(skb); #endif cofs = ihl + sizeof(struct sctphdr); sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); if (sch == NULL) return; chunk_type = sch->type; /* * Section 3: Multiple chunks can be bundled into one SCTP packet * up to the MTU size, except for the INIT, INIT ACK, and * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with * any other chunk in a packet. * * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be * bundled with an ABORT, but they MUST be placed before the ABORT * in the SCTP packet or they will be ignored by the receiver. */ if ((sch->type == SCTP_CID_COOKIE_ECHO) || (sch->type == SCTP_CID_COOKIE_ACK)) { int clen = ntohs(sch->length); if (clen >= sizeof(_sctpch)) { sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), sizeof(_sctpch), &_sctpch); if (sch && sch->type == SCTP_CID_ABORT) chunk_type = sch->type; } } event = (chunk_type < sizeof(sctp_events)) ? sctp_events[chunk_type] : IP_VS_SCTP_DATA; /* Update direction to INPUT_ONLY if necessary * or delete NO_OUTPUT flag if output packet detected */ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { if (direction == IP_VS_DIR_OUTPUT) cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; else direction = IP_VS_DIR_INPUT_ONLY; } next_state = sctp_states[direction][event][cp->state]; if (next_state != cp->state) { struct ip_vs_dest *dest = cp->dest; IP_VS_DBG_BUF(8, "%s %s %s:%d->" "%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((direction == IP_VS_DIR_OUTPUT) ? "output " : "input "), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), sctp_state_name(cp->state), sctp_state_name(next_state), refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && (next_state != IP_VS_SCTP_S_ESTABLISHED)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && (next_state == IP_VS_SCTP_S_ESTABLISHED)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } if (next_state == IP_VS_SCTP_S_ESTABLISHED) ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = next_state]; else /* What to do ? */ cp->timeout = sctp_timeouts[cp->state = next_state]; } static void sctp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { spin_lock_bh(&cp->lock); set_sctp_state(pd, cp, direction, skb); spin_unlock_bh(&cp->lock); } static inline __u16 sctp_app_hashkey(__be16 port) { return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) & SCTP_APP_TAB_MASK; } static int sctp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); hash = sctp_app_hashkey(port); list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void sctp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int sctp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } /* --------------------------------------------- * timeouts is netns related now. * --------------------------------------------- */ static int __ip_vs_sctp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); if (!pd->timeout_table) return -ENOMEM; return 0; } static void __ip_vs_sctp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_sctp = { .name = "SCTP", .protocol = IPPROTO_SCTP, .num_states = IP_VS_SCTP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __ip_vs_sctp_init, .exit_netns = __ip_vs_sctp_exit, .register_app = sctp_register_app, .unregister_app = sctp_unregister_app, .conn_schedule = sctp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = sctp_snat_handler, .dnat_handler = sctp_dnat_handler, .state_name = sctp_state_name, .state_transition = sctp_state_transition, .app_conn_bind = sctp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, };
82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RANDOM_H #define _LINUX_RANDOM_H #include <linux/bug.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/once.h> #include <uapi/linux/random.h> struct notifier_block; void add_device_randomness(const void *buf, size_t len); void __init add_bootloader_randomness(const void *buf, size_t len); void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; void add_interrupt_randomness(int irq) __latent_entropy; void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy); static inline void add_latent_entropy(void) { #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); #else add_device_randomness(NULL, 0); #endif } void get_random_bytes(void *buf, size_t len); size_t __must_check get_random_bytes_arch(void *buf, size_t len); u32 get_random_u32(void); u64 get_random_u64(void); static inline unsigned int get_random_int(void) { return get_random_u32(); } static inline unsigned long get_random_long(void) { #if BITS_PER_LONG == 64 return get_random_u64(); #else return get_random_u32(); #endif } /* * On 64-bit architectures, protect against non-terminated C string overflows * by zeroing out the first byte of the canary; this leaves 56 bits of entropy. */ #ifdef CONFIG_64BIT # ifdef __LITTLE_ENDIAN # define CANARY_MASK 0xffffffffffffff00UL # else /* big endian, 64 bits: */ # define CANARY_MASK 0x00ffffffffffffffUL # endif #else /* 32 bits: */ # define CANARY_MASK 0xffffffffUL #endif static inline unsigned long get_random_canary(void) { return get_random_long() & CANARY_MASK; } int __init random_init(const char *command_line); bool rng_is_initialized(void); int wait_for_random_bytes(void); int register_random_ready_notifier(struct notifier_block *nb); int unregister_random_ready_notifier(struct notifier_block *nb); /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). * Returns the result of the call to wait_for_random_bytes. */ static inline int get_random_bytes_wait(void *buf, size_t nbytes) { int ret = wait_for_random_bytes(); get_random_bytes(buf, nbytes); return ret; } #define declare_get_random_var_wait(name, ret_type) \ static inline int get_random_ ## name ## _wait(ret_type *out) { \ int ret = wait_for_random_bytes(); \ if (unlikely(ret)) \ return ret; \ *out = get_random_ ## name(); \ return 0; \ } declare_get_random_var_wait(u32, u32) declare_get_random_var_wait(u64, u32) declare_get_random_var_wait(int, unsigned int) declare_get_random_var_wait(long, unsigned long) #undef declare_get_random_var /* * This is designed to be standalone for just prandom * users, but for now we include it from <linux/random.h> * for legacy reasons. */ #include <linux/prandom.h> #ifdef CONFIG_ARCH_RANDOM # include <asm/archrandom.h> #else static inline bool __must_check arch_get_random_long(unsigned long *v) { return false; } static inline bool __must_check arch_get_random_int(unsigned int *v) { return false; } static inline bool __must_check arch_get_random_seed_long(unsigned long *v) { return false; } static inline bool __must_check arch_get_random_seed_int(unsigned int *v) { return false; } #endif /* * Called from the boot CPU during startup; not valid to call once * secondary CPUs are up and preemption is possible. */ #ifndef arch_get_random_seed_long_early static inline bool __init arch_get_random_seed_long_early(unsigned long *v) { WARN_ON(system_state != SYSTEM_BOOTING); return arch_get_random_seed_long(v); } #endif #ifndef arch_get_random_long_early static inline bool __init arch_get_random_long_early(unsigned long *v) { WARN_ON(system_state != SYSTEM_BOOTING); return arch_get_random_long(v); } #endif #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); #endif #ifndef MODULE extern const struct file_operations random_fops, urandom_fops; #endif #endif /* _LINUX_RANDOM_H */
1364 2 1357 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 // SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * * (C) Copyright 1991-2000 Linus Torvalds * * We have a per-user structure to keep track of how many * processes, files etc the user has claimed, in order to be * able to have per-user limits for system resources. */ #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/key.h> #include <linux/sched/user.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? */ struct user_namespace init_user_ns = { .uid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .gid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .projid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .ns.count = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, #ifdef CONFIG_USER_NS .ns.ops = &userns_operations, #endif .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif }; EXPORT_SYMBOL_GPL(init_user_ns); /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). */ #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is * occasionally also taken from softirq/tasklet context, when * task-structs get RCU-freed. Hence all locking must be softirq-safe. * But free_uid() is also called with local interrupts disabled, and running * local_bh_enable() with local interrupts disabled is an error - we'll run * softirq callbacks, and they can unconditionally enable interrupts, and * the caller of free_uid() didn't expect that.. */ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* * These routines must be called with the uidhash spinlock held! */ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { hlist_add_head(&up->uidhash_node, hashent); } static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); } static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { refcount_inc(&user->__count); return user; } } return NULL; } static int user_epoll_alloc(struct user_struct *up) { #ifdef CONFIG_EPOLL return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); #else return 0; #endif } static void user_epoll_free(struct user_struct *up) { #ifdef CONFIG_EPOLL percpu_counter_destroy(&up->epoll_watches); #endif } /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); user_epoll_free(up); kmem_cache_free(uid_cachep, up); } /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). * * If the user_struct could not be found, return NULL. */ struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { unsigned long flags; if (!up) return; if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } struct user_struct *alloc_uid(kuid_t uid) { struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; refcount_set(&new->__count, 1); if (user_epoll_alloc(new)) { kmem_cache_free(uid_cachep, new); return NULL; } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced * on adding the same user already.. */ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); up = new; } spin_unlock_irq(&uidhash_lock); } return up; } static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); if (user_epoll_alloc(&root_user)) panic("root_user epoll percpu counter alloc failed"); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; } subsys_initcall(uid_cache_init);
50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle incoming frames * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netfilter_bridge.h> #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE #include <net/netfilter/nf_queue.h> #endif #include <linux/neighbour.h> #include <net/arp.h> #include <net/dsa.h> #include <linux/export.h> #include <linux/rculist.h> #include "br_private.h" #include "br_private_tunnel.h" static int br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { br_drop_fake_rtable(skb); return netif_receive_skb(skb); } static int br_pass_frame_up(struct sk_buff *skb) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); struct net_bridge_vlan_group *vg; dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); /* Reset the offload_fwd_mark because there could be a stacked * bridge above, and it should not think this bridge it doing * that bridge's work forwarding out its ports. */ br_switchdev_frame_unmark(skb); /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc mode when someone * may be running packet capture. */ if (!(brdev->flags & IFF_PROMISC) && !br_allowed_egress(vg, skb)) { kfree_skb(skb); return NET_RX_DROP; } indev = skb->dev; skb->dev = brdev; skb = br_handle_vlan(br, NULL, vg, skb); if (!skb) return NET_RX_DROP; /* update the multicast stats if the packet is IGMP/MLD */ br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); } /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; u16 vid = 0; u8 state; if (!p || p->state == BR_STATE_DISABLED) goto drop; brmctx = &p->br->multicast_ctx; pmctx = &p->multicast_ctx; state = p->state; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, &state, &vlan)) goto out; nbp_switchdev_frame_mark(p, skb); /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); local_rcv = !!(br->dev->flags & IFF_PROMISC); if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { pkt_type = BR_PKT_BROADCAST; local_rcv = true; } else { pkt_type = BR_PKT_MULTICAST; if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid)) goto drop; } } if (state == BR_STATE_LEARNING) goto drop; BR_INPUT_SKB_CB(skb)->brdev = br->dev; BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); if (IS_ENABLED(CONFIG_INET) && (skb->protocol == htons(ETH_P_ARP) || skb->protocol == htons(ETH_P_RARP))) { br_do_proxy_suppress_arp(skb, br, vid, p); } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *msg, _msg; msg = br_is_nd_neigh_msg(skb, &_msg); if (msg) br_do_suppress_nd(skb, br, vid, p, msg); } switch (pkt_type) { case BR_PKT_MULTICAST: mdst = br_mdb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || br_multicast_is_router(brmctx, skb)) { local_rcv = true; br->dev->stats.multicast++; } mcast_hit = true; } else { local_rcv = true; br->dev->stats.multicast++; } break; case BR_PKT_UNICAST: dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); break; default: break; } if (dst) { unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb); if (now != dst->used) dst->used = now; br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit) br_flood(br, skb, pkt_type, local_rcv, false); else br_multicast_flood(mdst, skb, brmctx, local_rcv, false); } if (local_rcv) return br_pass_frame_up(skb); out: return 0; drop: kfree_skb(skb); goto out; } EXPORT_SYMBOL_GPL(br_handle_frame_finish); static void __br_handle_local_finish(struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; /* check if vlan is allowed, to avoid spoofing */ if ((p->flags & BR_LEARNING) && nbp_state_should_learn(p) && !br_opt_get(p->br, BROPT_NO_LL_LEARN) && br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0); } /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { __br_handle_local_finish(skb); /* return 1 to signal the okfn() was called so it's ok to use the skb */ return 1; } static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE struct nf_hook_entries *e = NULL; struct nf_hook_state state; unsigned int verdict, i; struct net *net; int ret; net = dev_net(skb->dev); #ifdef HAVE_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING])) goto frame_finish; #endif e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); if (!e) goto frame_finish; nf_hook_state_init(&state, NF_BR_PRE_ROUTING, NFPROTO_BRIDGE, skb->dev, NULL, NULL, net, br_handle_frame_finish); for (i = 0; i < e->num_hook_entries; i++) { verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) { *pskb = skb; return RX_HANDLER_PASS; } break; case NF_DROP: kfree_skb(skb); return RX_HANDLER_CONSUMED; case NF_QUEUE: ret = nf_queue(skb, &state, i, verdict); if (ret == 1) continue; return RX_HANDLER_CONSUMED; default: /* STOLEN */ return RX_HANDLER_CONSUMED; } } frame_finish: net = dev_net(skb->dev); br_handle_frame_finish(net, NULL, skb); #else br_handle_frame_finish(dev_net(skb->dev), NULL, skb); #endif return RX_HANDLER_CONSUMED; } /* Return 0 if the frame was not processed otherwise 1 * note: already called with rcu_read_lock */ static int br_process_frame_type(struct net_bridge_port *p, struct sk_buff *skb) { struct br_frame_type *tmp; hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list) if (unlikely(tmp->type == skb->protocol)) return tmp->frame_handler(p, skb); return 0; } /* * Return NULL if skb is handled * note: already called with rcu_read_lock */ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto drop; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return RX_HANDLER_CONSUMED; memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); p = br_port_get_rcu(skb->dev); if (p->flags & BR_VLAN_TUNNEL) br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p)); if (unlikely(is_link_local_ether_addr(dest))) { u16 fwd_mask = p->br->group_fwd_mask_required; /* * See IEEE 802.1D Table 7-10 Reserved addresses * * Assignment Value * Bridge Group Address 01-80-C2-00-00-00 * (MAC Control) 802.3 01-80-C2-00-00-01 * (Link Aggregation) 802.3 01-80-C2-00-00-02 * 802.1X PAE address 01-80-C2-00-00-03 * * 802.1AB LLDP 01-80-C2-00-00-0E * * Others reserved for future standardization */ fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, then must forward to keep loop detection */ if (p->br->stp_enabled == BR_NO_STP || fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; case 0x01: /* IEEE MAC (Pause) */ goto drop; case 0x0E: /* 802.1AB LLDP */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; default: /* Allow selective forwarding for most other protocols */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; } /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) * Thus return 1 from the okfn() to signal the skb is ok to pass */ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), NULL, skb, skb->dev, NULL, br_handle_local_finish) == 1) { return RX_HANDLER_PASS; } else { return RX_HANDLER_CONSUMED; } } if (unlikely(br_process_frame_type(p, skb))) return RX_HANDLER_PASS; forward: switch (p->state) { case BR_STATE_FORWARDING: case BR_STATE_LEARNING: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; return nf_hook_bridge_pre(skb, pskb); default: drop: kfree_skb(skb); } return RX_HANDLER_CONSUMED; } /* This function has no purpose other than to appease the br_port_get_rcu/rtnl * helpers which identify bridged ports according to the rx_handler installed * on them (so there _needs_ to be a bridge rx_handler even if we don't need it * to do anything useful). This bridge won't support traffic to/from the stack, * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal * frames from the ETH_P_XDSA packet_type handler. */ static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb) { return RX_HANDLER_PASS; } rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) { if (netdev_uses_dsa(dev)) return br_handle_frame_dummy; return br_handle_frame; } void br_add_frame(struct net_bridge *br, struct br_frame_type *ft) { hlist_add_head_rcu(&ft->list, &br->frame_type_list); } void br_del_frame(struct net_bridge *br, struct br_frame_type *ft) { struct br_frame_type *tmp; hlist_for_each_entry(tmp, &br->frame_type_list, list) if (ft == tmp) { hlist_del_rcu(&ft->list); return; } }
8 8 15 1 2 2 10 1 7 2 1 1 4 4 2 2 4 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include <linux/jhash.h> #include <linux/filter.h> #include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> #include <linux/irq_work.h> #include <linux/btf_ids.h> #include <linux/buildid.h> #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; u64 data[]; }; struct bpf_stack_map { struct bpf_map map; void *elems; struct pcpu_freelist freelist; u32 n_buckets; struct stack_map_bucket *buckets[]; }; /* irq_work to run up_read() for build_id lookup in nmi context */ struct stack_map_irq_work { struct irq_work irq_work; struct mm_struct *mm; }; static void do_up_read(struct irq_work *entry) { struct stack_map_irq_work *work; if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) return; work = container_of(entry, struct stack_map_irq_work, irq_work); mmap_read_unlock_non_owner(work->mm); } static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); } static inline int stack_map_data_size(struct bpf_map *map) { return stack_map_use_build_id(map) ? sizeof(struct bpf_stack_build_id) : sizeof(u64); } static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u64 elem_size = sizeof(struct stack_map_bucket) + (u64)smap->map.value_size; int err; smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, smap->map.numa_node); if (!smap->elems) return -ENOMEM; err = pcpu_freelist_init(&smap->freelist); if (err) goto free_elems; pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, smap->map.max_entries); return 0; free_elems: bpf_map_area_free(smap->elems); return err; } /* Called from syscall */ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; u64 cost, n_buckets; int err; if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8) return ERR_PTR(-EINVAL); BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); if (attr->map_flags & BPF_F_STACK_BUILD_ID) { if (value_size % sizeof(struct bpf_stack_build_id) || value_size / sizeof(struct bpf_stack_build_id) > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ n_buckets = roundup_pow_of_two(attr->max_entries); if (!n_buckets) return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) goto free_smap; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; return &smap->map; put_buffers: put_callchain_buffers(); free_smap: bpf_map_area_free(smap); return ERR_PTR(err); } static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { int i; struct vm_area_struct *vma; bool irq_work_busy = false; struct stack_map_irq_work *work = NULL; if (irqs_disabled()) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { work = this_cpu_ptr(&up_read_work); if (irq_work_is_busy(&work->irq_work)) { /* cannot queue more up_read, fallback */ irq_work_busy = true; } } else { /* * PREEMPT_RT does not allow to trylock mmap sem in * interrupt disabled context. Force the fallback code. */ irq_work_busy = true; } } /* * We cannot do up_read() when the irq is disabled, because of * risk to deadlock with rq_lock. To do build_id lookup when the * irqs are disabled, we need to run up_read() in irq_work. We use * a percpu variable to do the irq_work. If the irq_work is * already used by another lookup, we fall back to report ips. * * Same fallback is used for kernel stack (!user) on a stackmap * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; } if (!work) { mmap_read_unlock(current->mm); } else { work->mm = current->mm; /* The lock will be released once we're out of interrupt * context. Tell lockdep that we've released it now so * it doesn't complain that we forgot to release it. */ rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } static struct perf_callchain_entry * get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; int rctx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. For 32-bit systems, it is * necessary to fix this mismatch. */ if (__BITS_PER_LONG != 64) { unsigned long *from = (unsigned long *) entry->ip; u64 *to = entry->ip; int i; /* copy data from the end to avoid using extra buffer */ for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } put_callchain_entry(rctx); return entry; #else /* CONFIG_STACKTRACE */ return NULL; #endif } static long __bpf_get_stackid(struct bpf_map *map, struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); hash_matches = bucket && bucket->hash == hash; /* fast cmp */ if (hash_matches && flags & BPF_F_FAST_STACK_CMP) return id; if (stack_map_use_build_id(map)) { /* for build_id+offset, pop a bucket before slow cmp */ new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; new_bucket->nr = trace_nr; stack_map_get_build_id_offset( (struct bpf_stack_build_id *)new_bucket->data, ips, trace_nr, user); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; } if (bucket && !(flags & BPF_F_REUSE_STACKID)) { pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return -EEXIST; } } else { if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, ips, trace_len) == 0) return id; if (bucket && !(flags & BPF_F_REUSE_STACKID)) return -EEXIST; new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; memcpy(new_bucket->data, ips, trace_len); } new_bucket->hash = hash; new_bucket->nr = trace_nr; old_bucket = xchg(&smap->buckets[id], new_bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return id; } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; max_depth += skip; if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ return -EFAULT; return __bpf_get_stackid(map, trace, flags); } const struct bpf_func_proto bpf_get_stackid_proto = { .func = bpf_get_stackid, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static __u64 count_kernel_ip(struct perf_callchain_entry *trace) { __u64 nr_kernel = 0; while (nr_kernel < trace->nr) { if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) break; nr_kernel++; } return nr_kernel; } BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, struct bpf_map *, map, u64, flags) { struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; __u64 nr_kernel; int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; user = flags & BPF_F_USER_STACK; kernel = !user; trace = ctx->data->callchain; if (unlikely(!trace)) return -EFAULT; nr_kernel = count_kernel_ip(trace); if (kernel) { __u64 nr = trace->nr; trace->nr = nr_kernel; ret = __bpf_get_stackid(map, trace, flags); /* restore nr */ trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; skip += nr_kernel; if (skip > BPF_F_SKIP_FIELD_MASK) return -EFAULT; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; ret = __bpf_get_stackid(map, trace, flags); } return ret; } const struct bpf_func_proto bpf_get_stackid_proto_pe = { .func = bpf_get_stackid_pe, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; int err = -EINVAL; u64 *ips; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) goto clear; if (kernel && user_build_id) goto clear; elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) : sizeof(u64); if (unlikely(size % elem_size)) goto clear; /* cannot get valid user stack for task without user_mode regs */ if (task && user && !user_mode(regs)) goto err_fault; num_elem = size / elem_size; max_depth = num_elem + skip; if (sysctl_perf_event_max_stack < max_depth) max_depth = sysctl_perf_event_max_stack; if (trace_in) trace = trace_in; else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) goto err_fault; if (trace->nr < skip) goto err_fault; trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; if (user && user_build_id) stack_map_get_build_id_offset(buf, ips, trace_nr, user); else memcpy(buf, ips, copy_len); if (size > copy_len) memset(buf + copy_len, 0, size - copy_len); return copy_len; err_fault: err = -EFAULT; clear: memset(buf, 0, size); return err; } BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); } const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { struct pt_regs *regs; long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); if (regs) res = __bpf_get_stack(regs, task, NULL, buf, size, flags); put_task_stack(task); return res; } const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_task_struct_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = (struct pt_regs *)(ctx->regs); struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; int err = -EINVAL; __u64 nr_kernel; if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) goto clear; user = flags & BPF_F_USER_STACK; kernel = !user; err = -EFAULT; trace = ctx->data->callchain; if (unlikely(!trace)) goto clear; nr_kernel = count_kernel_ip(trace); if (kernel) { __u64 nr = trace->nr; trace->nr = nr_kernel; err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); /* restore nr */ trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; skip += nr_kernel; if (skip > BPF_F_SKIP_FIELD_MASK) goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); } return err; clear: memset(buf, 0, size); return err; } const struct bpf_func_proto bpf_get_stack_proto_pe = { .func = bpf_get_stack_pe, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } /* Called from syscall */ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *old_bucket; u32 id = *(u32 *)key, trace_len; if (unlikely(id >= smap->n_buckets)) return -ENOENT; bucket = xchg(&smap->buckets[id], NULL); if (!bucket) return -ENOENT; trace_len = bucket->nr * stack_map_data_size(map); memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); u32 id; WARN_ON_ONCE(!rcu_read_lock_held()); if (!key) { id = 0; } else { id = *(u32 *)key; if (id >= smap->n_buckets || !smap->buckets[id]) id = 0; else id++; } while (id < smap->n_buckets && !smap->buckets[id]) id++; if (id >= smap->n_buckets) return -ENOENT; *(u32 *)next_key = id; return 0; } static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ static int stack_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *old_bucket; u32 id = *(u32 *)key; if (unlikely(id >= smap->n_buckets)) return -E2BIG; old_bucket = xchg(&smap->buckets[id], NULL); if (old_bucket) { pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } else { return -ENOENT; } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); bpf_map_area_free(smap); put_callchain_buffers(); } static int stack_trace_map_btf_id; const struct bpf_map_ops stack_trace_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_stack_map", .map_btf_id = &stack_trace_map_btf_id, }; static int __init stack_map_init(void) { int cpu; struct stack_map_irq_work *work; for_each_possible_cpu(cpu) { work = per_cpu_ptr(&up_read_work, cpu); init_irq_work(&work->irq_work, do_up_read); } return 0; } subsys_initcall(stack_map_init);
2395 2398 2397 2398 2398 2399 2398 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 // SPDX-License-Identifier: GPL-2.0 /* * SHA1 routine optimized to do word accesses rather than byte accesses, * and to avoid unnecessary copies into the context array. * * This was based on the git SHA1 implementation. */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/bitops.h> #include <linux/string.h> #include <crypto/sha1.h> #include <asm/unaligned.h> /* * If you have 32 registers or more, the compiler can (and should) * try to change the array[] accesses into registers. However, on * machines with less than ~25 registers, that won't really work, * and at least gcc will make an unholy mess of it. * * So to avoid that mess which just slows things down, we force * the stores to memory to actually happen (we might be better off * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as * suggested by Artur Skawina - that will also make gcc unable to * try to do the silly "optimize away loads" part because it won't * see what the value will be). * * Ben Herrenschmidt reports that on PPC, the C version comes close * to the optimized asm with this (ie on PPC you don't want that * 'volatile', since there are lots of registers). * * On ARM we get the best code generation by forcing a full memory barrier * between each SHA_ROUND, otherwise gcc happily get wild with spilling and * the stack frame size simply explode and performance goes down the drain. */ #ifdef CONFIG_X86 #define setW(x, val) (*(volatile __u32 *)&W(x) = (val)) #elif defined(CONFIG_ARM) #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) #else #define setW(x, val) (W(x) = (val)) #endif /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) /* * Where do we get the source from? The first 16 iterations get it from * the input data, the next mix it from the 512-bit array. */ #define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t) #define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ __u32 TEMP = input(t); setW(t, TEMP); \ E += TEMP + rol32(A,5) + (fn) + (constant); \ B = ror32(B, 2); \ TEMP = E; E = D; D = C; C = B; B = A; A = TEMP; } while (0) #define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E ) #define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) #define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) /** * sha1_transform - single block SHA1 transform (deprecated) * * @digest: 160 bit digest to update * @data: 512 bits of data to hash * @array: 16 words of workspace (see note) * * This function executes SHA-1's internal compression function. It updates the * 160-bit internal state (@digest) with a single 512-bit data block (@data). * * Don't use this function. SHA-1 is no longer considered secure. And even if * you do have to use SHA-1, this isn't the correct way to hash something with * SHA-1 as this doesn't handle padding and finalization. * * Note: If the hash is security sensitive, the caller should be sure * to clear the workspace. This is left to the caller to avoid * unnecessary clears between chained hashing operations. */ void sha1_transform(__u32 *digest, const char *data, __u32 *array) { __u32 A, B, C, D, E; unsigned int i = 0; A = digest[0]; B = digest[1]; C = digest[2]; D = digest[3]; E = digest[4]; /* Round 1 - iterations 0-16 take their input from 'data' */ for (; i < 16; ++i) T_0_15(i, A, B, C, D, E); /* Round 1 - tail. Input from 512-bit mixing array */ for (; i < 20; ++i) T_16_19(i, A, B, C, D, E); /* Round 2 */ for (; i < 40; ++i) T_20_39(i, A, B, C, D, E); /* Round 3 */ for (; i < 60; ++i) T_40_59(i, A, B, C, D, E); /* Round 4 */ for (; i < 80; ++i) T_60_79(i, A, B, C, D, E); digest[0] += A; digest[1] += B; digest[2] += C; digest[3] += D; digest[4] += E; } EXPORT_SYMBOL(sha1_transform); /** * sha1_init - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ void sha1_init(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; buf[2] = 0x98badcfe; buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } EXPORT_SYMBOL(sha1_init);
67 67 67 67 67 67 67 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 // SPDX-License-Identifier: GPL-2.0-or-later /* Provide a way to create a superblock configuration context within the kernel * that allows a superblock to be set up prior to mounting. * * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/magic.h> #include <linux/security.h> #include <linux/mnt_namespace.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <net/net_namespace.h> #include <asm/sections.h> #include "mount.h" #include "internal.h" enum legacy_fs_param { LEGACY_FS_UNSET_PARAMS, LEGACY_FS_MONOLITHIC_PARAMS, LEGACY_FS_INDIVIDUAL_PARAMS, }; struct legacy_fs_context { char *legacy_data; /* Data page for legacy filesystems */ size_t data_size; enum legacy_fs_param param_type; }; static int legacy_init_fs_context(struct fs_context *fc); static const struct constant_table common_set_sb_flag[] = { { "dirsync", SB_DIRSYNC }, { "lazytime", SB_LAZYTIME }, { "mand", SB_MANDLOCK }, { "ro", SB_RDONLY }, { "sync", SB_SYNCHRONOUS }, { }, }; static const struct constant_table common_clear_sb_flag[] = { { "async", SB_SYNCHRONOUS }, { "nolazytime", SB_LAZYTIME }, { "nomand", SB_MANDLOCK }, { "rw", SB_RDONLY }, { }, }; /* * Check for a common mount option that manipulates s_flags. */ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) { unsigned int token; token = lookup_constant(common_set_sb_flag, key, 0); if (token) { fc->sb_flags |= token; fc->sb_flags_mask |= token; return 0; } token = lookup_constant(common_clear_sb_flag, key, 0); if (token) { fc->sb_flags &= ~token; fc->sb_flags_mask |= token; return 0; } return -ENOPARAM; } /** * vfs_parse_fs_param_source - Handle setting "source" via parameter * @fc: The filesystem context to modify * @param: The parameter * * This is a simple helper for filesystems to verify that the "source" they * accept is sane. * * Returns 0 on success, -ENOPARAM if this is not "source" parameter, and * -EINVAL otherwise. In the event of failure, supplementary error information * is logged. */ int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param) { if (strcmp(param->key, "source") != 0) return -ENOPARAM; if (param->type != fs_value_is_string) return invalf(fc, "Non-string source"); if (fc->source) return invalf(fc, "Multiple sources"); fc->source = param->string; param->string = NULL; return 0; } EXPORT_SYMBOL(vfs_parse_fs_param_source); /** * vfs_parse_fs_param - Add a single parameter to a superblock config * @fc: The filesystem context to modify * @param: The parameter * * A single mount option in string form is applied to the filesystem context * being set up. Certain standard options (for example "ro") are translated * into flag bits without going to the filesystem. The active security module * is allowed to observe and poach options. Any other options are passed over * to the filesystem to parse. * * This may be called multiple times for a context. * * Returns 0 on success and a negative error code on failure. In the event of * failure, supplementary error information may have been set. */ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) { int ret; if (!param->key) return invalf(fc, "Unnamed parameter\n"); ret = vfs_parse_sb_flag(fc, param->key); if (ret != -ENOPARAM) return ret; ret = security_fs_context_parse_param(fc, param); if (ret != -ENOPARAM) /* Param belongs to the LSM or is disallowed by the LSM; so * don't pass to the FS. */ return ret; if (fc->ops->parse_param) { ret = fc->ops->parse_param(fc, param); if (ret != -ENOPARAM) return ret; } /* If the filesystem doesn't take any arguments, give it the * default handling of source. */ ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; return invalf(fc, "%s: Unknown parameter '%s'", fc->fs_type->name, param->key); } EXPORT_SYMBOL(vfs_parse_fs_param); /** * vfs_parse_fs_string - Convenience function to just parse a string. */ int vfs_parse_fs_string(struct fs_context *fc, const char *key, const char *value, size_t v_size) { int ret; struct fs_parameter param = { .key = key, .type = fs_value_is_flag, .size = v_size, }; if (value) { param.string = kmemdup_nul(value, v_size, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } ret = vfs_parse_fs_param(fc, &param); kfree(param.string); return ret; } EXPORT_SYMBOL(vfs_parse_fs_string); /** * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data * @ctx: The superblock configuration to fill in. * @data: The data to parse * * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be * called from the ->monolithic_mount_data() fs_context operation. * * Returns 0 on success or the error returned by the ->parse_option() fs_context * operation on failure. */ int generic_parse_monolithic(struct fs_context *fc, void *data) { char *options = data, *key; int ret = 0; if (!options) return 0; ret = security_sb_eat_lsm_opts(options, &fc->security); if (ret) return ret; while ((key = strsep(&options, ",")) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); if (value) { if (value == key) continue; *value++ = 0; v_len = strlen(value); } ret = vfs_parse_fs_string(fc, key, value, v_len); if (ret < 0) break; } } return ret; } EXPORT_SYMBOL(generic_parse_monolithic); /** * alloc_fs_context - Create a filesystem context. * @fs_type: The filesystem type. * @reference: The dentry from which this one derives (or NULL) * @sb_flags: Filesystem/superblock flags (SB_*) * @sb_flags_mask: Applicable members of @sb_flags * @purpose: The purpose that this configuration shall be used for. * * Open a filesystem and create a mount context. The mount context is * initialised with the supplied flags and, if a submount/automount from * another superblock (referred to by @reference) is supplied, may have * parameters such as namespaces copied across from that superblock. */ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct dentry *reference, unsigned int sb_flags, unsigned int sb_flags_mask, enum fs_context_purpose purpose) { int (*init_fs_context)(struct fs_context *); struct fs_context *fc; int ret = -ENOMEM; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); fc->purpose = purpose; fc->sb_flags = sb_flags; fc->sb_flags_mask = sb_flags_mask; fc->fs_type = get_filesystem(fs_type); fc->cred = get_current_cred(); fc->net_ns = get_net(current->nsproxy->net_ns); fc->log.prefix = fs_type->name; mutex_init(&fc->uapi_mutex); switch (purpose) { case FS_CONTEXT_FOR_MOUNT: fc->user_ns = get_user_ns(fc->cred->user_ns); break; case FS_CONTEXT_FOR_SUBMOUNT: fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); break; case FS_CONTEXT_FOR_RECONFIGURE: atomic_inc(&reference->d_sb->s_active); fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); fc->root = dget(reference); break; } /* TODO: Make all filesystems support this unconditionally */ init_fs_context = fc->fs_type->init_fs_context; if (!init_fs_context) init_fs_context = legacy_init_fs_context; ret = init_fs_context(fc); if (ret < 0) goto err_fc; fc->need_free = true; return fc; err_fc: put_fs_context(fc); return ERR_PTR(ret); } struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, unsigned int sb_flags) { return alloc_fs_context(fs_type, NULL, sb_flags, 0, FS_CONTEXT_FOR_MOUNT); } EXPORT_SYMBOL(fs_context_for_mount); struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, unsigned int sb_flags, unsigned int sb_flags_mask) { return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags, sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE); } EXPORT_SYMBOL(fs_context_for_reconfigure); struct fs_context *fs_context_for_submount(struct file_system_type *type, struct dentry *reference) { return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); } EXPORT_SYMBOL(fs_context_for_submount); void fc_drop_locked(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; dput(fc->root); fc->root = NULL; deactivate_locked_super(sb); } static void legacy_fs_context_free(struct fs_context *fc); /** * vfs_dup_fc_config: Duplicate a filesystem context. * @src_fc: The context to copy. */ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) { struct fs_context *fc; int ret; if (!src_fc->ops->dup) return ERR_PTR(-EOPNOTSUPP); fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL); if (!fc) return ERR_PTR(-ENOMEM); mutex_init(&fc->uapi_mutex); fc->fs_private = NULL; fc->s_fs_info = NULL; fc->source = NULL; fc->security = NULL; get_filesystem(fc->fs_type); get_net(fc->net_ns); get_user_ns(fc->user_ns); get_cred(fc->cred); if (fc->log.log) refcount_inc(&fc->log.log->usage); /* Can't call put until we've called ->dup */ ret = fc->ops->dup(fc, src_fc); if (ret < 0) goto err_fc; ret = security_fs_context_dup(fc, src_fc); if (ret < 0) goto err_fc; return fc; err_fc: put_fs_context(fc); return ERR_PTR(ret); } EXPORT_SYMBOL(vfs_dup_fs_context); /** * logfc - Log a message to a filesystem context * @fc: The filesystem context to log to. * @fmt: The format of the buffer. */ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...) { va_list va; struct va_format vaf = {.fmt = fmt, .va = &va}; va_start(va, fmt); if (!log) { switch (level) { case 'w': printk(KERN_WARNING "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; case 'e': printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; default: printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; } } else { unsigned int logsize = ARRAY_SIZE(log->buffer); u8 index; char *q = kasprintf(GFP_KERNEL, "%c %s%s%pV\n", level, prefix ? prefix : "", prefix ? ": " : "", &vaf); index = log->head & (logsize - 1); BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) || sizeof(log->tail) != sizeof(u8)); if ((u8)(log->head - log->tail) == logsize) { /* The buffer is full, discard the oldest message */ if (log->need_free & (1 << index)) kfree(log->buffer[index]); log->tail++; } log->buffer[index] = q ? q : "OOM: Can't store error string"; if (q) log->need_free |= 1 << index; else log->need_free &= ~(1 << index); log->head++; } va_end(va); } EXPORT_SYMBOL(logfc); /* * Free a logging structure. */ static void put_fc_log(struct fs_context *fc) { struct fc_log *log = fc->log.log; int i; if (log) { if (refcount_dec_and_test(&log->usage)) { fc->log.log = NULL; for (i = 0; i <= 7; i++) if (log->need_free & (1 << i)) kfree(log->buffer[i]); kfree(log); } } } /** * put_fs_context - Dispose of a superblock configuration context. * @fc: The context to dispose of. */ void put_fs_context(struct fs_context *fc) { struct super_block *sb; if (fc->root) { sb = fc->root->d_sb; dput(fc->root); fc->root = NULL; deactivate_super(sb); } if (fc->need_free && fc->ops && fc->ops->free) fc->ops->free(fc); security_free_mnt_opts(&fc->security); put_net(fc->net_ns); put_user_ns(fc->user_ns); put_cred(fc->cred); put_fc_log(fc); put_filesystem(fc->fs_type); kfree(fc->source); kfree(fc); } EXPORT_SYMBOL(put_fs_context); /* * Free the config for a filesystem that doesn't support fs_context. */ static void legacy_fs_context_free(struct fs_context *fc) { struct legacy_fs_context *ctx = fc->fs_private; if (ctx) { if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) kfree(ctx->legacy_data); kfree(ctx); } } /* * Duplicate a legacy config. */ static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { struct legacy_fs_context *ctx; struct legacy_fs_context *src_ctx = src_fc->fs_private; ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) { ctx->legacy_data = kmemdup(src_ctx->legacy_data, src_ctx->data_size, GFP_KERNEL); if (!ctx->legacy_data) { kfree(ctx); return -ENOMEM; } } fc->fs_private = ctx; return 0; } /* * Add a parameter to a legacy config. We build up a comma-separated list of * options. */ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct legacy_fs_context *ctx = fc->fs_private; unsigned int size = ctx->data_size; size_t len = 0; int ret; ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); switch (param->type) { case fs_value_is_string: len = 1 + param->size; fallthrough; case fs_value_is_flag: len += strlen(param->key); break; default: return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported", param->key); } if (size + len + 2 > PAGE_SIZE) return invalf(fc, "VFS: Legacy: Cumulative options too large"); if (strchr(param->key, ',') || (param->type == fs_value_is_string && memchr(param->string, ',', param->size))) return invalf(fc, "VFS: Legacy: Option '%s' contained comma", param->key); if (!ctx->legacy_data) { ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!ctx->legacy_data) return -ENOMEM; } ctx->legacy_data[size++] = ','; len = strlen(param->key); memcpy(ctx->legacy_data + size, param->key, len); size += len; if (param->type == fs_value_is_string) { ctx->legacy_data[size++] = '='; memcpy(ctx->legacy_data + size, param->string, param->size); size += param->size; } ctx->legacy_data[size] = '\0'; ctx->data_size = size; ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS; return 0; } /* * Add monolithic mount data. */ static int legacy_parse_monolithic(struct fs_context *fc, void *data) { struct legacy_fs_context *ctx = fc->fs_private; if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) { pr_warn("VFS: Can't mix monolithic and individual options\n"); return -EINVAL; } ctx->legacy_data = data; ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS; if (!ctx->legacy_data) return 0; if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA) return 0; return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security); } /* * Get a mountable root with the legacy mount command. */ static int legacy_get_tree(struct fs_context *fc) { struct legacy_fs_context *ctx = fc->fs_private; struct super_block *sb; struct dentry *root; root = fc->fs_type->mount(fc->fs_type, fc->sb_flags, fc->source, ctx->legacy_data); if (IS_ERR(root)) return PTR_ERR(root); sb = root->d_sb; BUG_ON(!sb); fc->root = root; return 0; } /* * Handle remount. */ static int legacy_reconfigure(struct fs_context *fc) { struct legacy_fs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb; if (!sb->s_op->remount_fs) return 0; return sb->s_op->remount_fs(sb, &fc->sb_flags, ctx ? ctx->legacy_data : NULL); } const struct fs_context_operations legacy_fs_context_ops = { .free = legacy_fs_context_free, .dup = legacy_fs_context_dup, .parse_param = legacy_parse_param, .parse_monolithic = legacy_parse_monolithic, .get_tree = legacy_get_tree, .reconfigure = legacy_reconfigure, }; /* * Initialise a legacy context for a filesystem that doesn't support * fs_context. */ static int legacy_init_fs_context(struct fs_context *fc) { fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); if (!fc->fs_private) return -ENOMEM; fc->ops = &legacy_fs_context_ops; return 0; } int parse_monolithic_mount_data(struct fs_context *fc, void *data) { int (*monolithic_mount_data)(struct fs_context *, void *); monolithic_mount_data = fc->ops->parse_monolithic; if (!monolithic_mount_data) monolithic_mount_data = generic_parse_monolithic; return monolithic_mount_data(fc, data); } /* * Clean up a context after performing an action on it and put it into a state * from where it can be used to reconfigure a superblock. * * Note that here we do only the parts that can't fail; the rest is in * finish_clean_context() below and in between those fs_context is marked * FS_CONTEXT_AWAITING_RECONF. The reason for splitup is that after * successful mount or remount we need to report success to userland. * Trying to do full reinit (for the sake of possible subsequent remount) * and failing to allocate memory would've put us into a nasty situation. * So here we only discard the old state and reinitialization is left * until we actually try to reconfigure. */ void vfs_clean_context(struct fs_context *fc) { if (fc->need_free && fc->ops && fc->ops->free) fc->ops->free(fc); fc->need_free = false; fc->fs_private = NULL; fc->s_fs_info = NULL; fc->sb_flags = 0; security_free_mnt_opts(&fc->security); kfree(fc->source); fc->source = NULL; fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; fc->phase = FS_CONTEXT_AWAITING_RECONF; } int finish_clean_context(struct fs_context *fc) { int error; if (fc->phase != FS_CONTEXT_AWAITING_RECONF) return 0; if (fc->fs_type->init_fs_context) error = fc->fs_type->init_fs_context(fc); else error = legacy_init_fs_context(fc); if (unlikely(error)) { fc->phase = FS_CONTEXT_FAILED; return error; } fc->need_free = true; fc->phase = FS_CONTEXT_RECONF_PARAMS; return 0; }
193 193 193 187 3 184 1 183 1 31 129 910 132 173 20 690 838 837 772 831 7 836 2 721 185 189 693 905 15 15 951 172 487 696 1025 487 695 1023 2 950 172 697 17 468 1023 1025 1 33 1 282 1025 340 342 1 341 324 644 644 266 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/open.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> #include <linux/ima.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> #include "internal.h" int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ if (length < 0) return -EINVAL; newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } /* Remove suid, sgid, and file capabilities on truncate too */ ret = dentry_needs_remove_privs(mnt_userns, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(mnt_userns, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { struct user_namespace *mnt_userns; struct inode *inode; long error; inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode)) return -EINVAL; error = mnt_want_write(path->mnt); if (error) goto out; mnt_userns = mnt_user_ns(path->mnt); error = inode_permission(mnt_userns, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; error = -EPERM; if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = security_path_truncate(path); if (!error) error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; } EXPORT_SYMBOL_GPL(vfs_truncate); long do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... */ return -EINVAL; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_truncate(&path, length); path_put(&path); } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return do_sys_truncate(path, length); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) { return do_sys_truncate(path, length); } #endif long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; struct dentry *dentry; struct fd f; int error; error = -EINVAL; if (length < 0) goto out; error = -EBADF; f = fdget(fd); if (!f.file) goto out; /* explicitly opened as large or we are on 64-bit box */ if (f.file->f_flags & O_LARGEFILE) small = 0; dentry = f.file->f_path.dentry; inode = dentry->d_inode; error = -EINVAL; if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) goto out_putf; error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) goto out_putf; error = -EPERM; /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); error = security_path_truncate(&f.file->f_path); if (!error) error = do_truncate(file_mnt_user_ns(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: fdput(f); out: return error; } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) { return do_sys_ftruncate(fd, length, 1); } #endif /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { return do_sys_ftruncate(fd, length, 0); } #endif /* BITS_PER_LONG == 32 */ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); long ret; if (offset < 0 || len <= 0) return -EINVAL; /* Return error if mode is not supported */ if (mode & ~FALLOC_FL_SUPPORTED_MASK) return -EOPNOTSUPP; /* Punch hole and zero range are mutually exclusive */ if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; /* Punch hole must have keep size set */ if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* Collapse range should only be used exclusively. */ if ((mode & FALLOC_FL_COLLAPSE_RANGE) && (mode & ~FALLOC_FL_COLLAPSE_RANGE)) return -EINVAL; /* Insert range should only be used exclusively. */ if ((mode & FALLOC_FL_INSERT_RANGE) && (mode & ~FALLOC_FL_INSERT_RANGE)) return -EINVAL; /* Unshare range should only be used with allocate mode. */ if ((mode & FALLOC_FL_UNSHARE_RANGE) && (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* * We can only allow pure fallocate on append only files */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ ret = security_file_permission(file, MAY_WRITE); if (ret) return ret; if (S_ISFIFO(inode->i_mode)) return -ESPIPE; if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wrap through zero too */ if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; if (!file->f_op->fallocate) return -EOPNOTSUPP; file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* * Create inotify and fanotify events. * * To keep the logic simple always create events if fallocate succeeds. * This implies that events are even created if the file size remains * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. */ if (ret == 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { struct fd f = fdget(fd); int error = -EBADF; if (f.file) { error = vfs_fallocate(f.file, mode, offset, len); fdput(f); } return error; } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { return ksys_fallocate(fd, mode, offset, len); } /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. */ static const struct cred *access_override_creds(void) { const struct cred *old_cred; struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) return NULL; override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ kuid_t root_uid = make_kuid(override_cred->user_ns, 0); if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = override_cred->cap_permitted; } /* * The new set of credentials can *only* be used in * task-synchronous circumstances, and does not need * RCU freeing, unless somebody then takes a separate * reference to it. * * NOTE! This is _only_ true because this credential * is used purely for override_creds() that installs * it as the subjective cred. Other threads will be * accessing ->real_cred, not the subjective cred. * * If somebody _does_ make a copy of this (using the * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous * cred accesses will keep things non-RCY. */ override_cred->non_rcu = 1; old_cred = override_creds(override_cred); /* override_cred() gets its own ref */ put_cred(override_cred); return old_cred; } static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (!(flags & AT_EACCESS)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; } retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* * MAY_EXEC on regular files is denied if the fs is mounted * with the "noexec" flag. */ res = -EACCES; if (path_noexec(&path)) goto out_path_release; } res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() * is OK without a mnt_want/drop_write() pair. Since * no actual write to the fs is performed here, we do * not need to telegraph to that to anyone. * * By doing this, we accept that this access is * inherently racy and know that the fs may change * state before we even see this result. */ if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: path_put(&path); if (retry_estale(res, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: if (old_cred) revert_creds(old_cred); return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { return do_faccessat(dfd, filename, mode, 0); } SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, int, flags) { return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { return do_faccessat(AT_FDCWD, filename, mode, 0); } SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; set_fs_pwd(current->fs, &path); dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); int error; error = -EBADF; if (!f.file) goto out; error = -ENOTDIR; if (!d_can_lookup(f.file->f_path.dentry)) goto out_putf; error = file_permission(f.file, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: fdput(f); out: return error; } SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) goto dput_and_out; set_fs_root(current->fs, &path); error = 0; dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; retry_deleg: inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(mnt_user_ns(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path->mnt); return error; } int vfs_fchmod(struct file *file, umode_t mode) { audit_file(file); return chmod_common(&file->f_path, mode); } SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { struct fd f = fdget(fd); int err = -EBADF; if (f.file) { err = vfs_fchmod(f.file, mode); fdput(f); } return err; } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { error = chmod_common(&path, mode); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { return do_fchmodat(dfd, filename, mode); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return do_fchmodat(AT_FDCWD, filename, mode); } int chown_common(const struct path *path, uid_t user, gid_t group) { struct user_namespace *mnt_userns, *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; struct iattr newattrs; kuid_t uid; kgid_t gid; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); mnt_userns = mnt_user_ns(path->mnt); fs_userns = i_user_ns(inode); uid = mapped_kuid_user(mnt_userns, fs_userns, uid); gid = mapped_kgid_user(mnt_userns, fs_userns, gid); retry_deleg: newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { if (!uid_valid(uid)) return -EINVAL; newattrs.ia_valid |= ATTR_UID; newattrs.ia_uid = uid; } if (group != (gid_t) -1) { if (!gid_valid(gid)) return -EINVAL; newattrs.ia_valid |= ATTR_GID; newattrs.ia_gid = gid; } inode_lock(inode); if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | setattr_should_drop_sgid(mnt_userns, inode); error = security_path_chown(path, uid, gid); if (!error) error = notify_change(mnt_userns, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { struct path path; int error = -EINVAL; int lookup_flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); if (error) goto out_release; error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) { return do_fchownat(dfd, filename, user, group, flag); } SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, AT_SYMLINK_NOFOLLOW); } int vfs_fchown(struct file *file, uid_t user, gid_t group) { int error; error = mnt_want_write_file(file); if (error) return error; audit_file(file); error = chown_common(&file->f_path, user, group); mnt_drop_write_file(file); return error; } int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { struct fd f = fdget(fd); int error = -EBADF; if (f.file) { error = vfs_fchown(f.file, user, group); fdput(f); } return error; } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { return ksys_fchown(fd, user, group); } static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; f->f_wb_err = filemap_sample_wb_err(f->f_mapping); f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; f->f_op = &empty_fops; return 0; } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) goto cleanup_file; error = __mnt_want_write(f->f_path.mnt); if (unlikely(error)) { put_write_access(inode); goto cleanup_file; } f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); if (error) goto cleanup_all; error = break_lease(locks_inode(f), f->f_flags); if (error) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_mode |= FMODE_OPENED; if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; } /* * XXX: Huge page cache doesn't support writing yet. Drop all page * cache for this file before processing writes. */ if (f->f_mode & FMODE_WRITE) { /* * Paired with smp_mb() in collapse_file() to ensure nr_thps * is up to date and the update to i_writecount by * get_write_access() is visible. Ensures subsequent insertion * of THPs into the page cache will fail. */ smp_mb(); if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; filemap_invalidate_lock(inode->i_mapping); /* * unmap_mapping_range just need to be called once * here, because the private pages is not need to be * unmapped mapping (e.g. data segment of dynamic * shared libraries here). */ unmap_mapping_range(mapping, 0, 0, 0); truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); put_file_access(f); cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; f->f_path.dentry = NULL; f->f_inode = NULL; return error; } /** * finish_open - finish opening a file * @file: file pointer * @dentry: pointer to dentry * @open: open callback * @opened: state of open * * This can be used to finish opening a file passed to i_op->atomic_open(). * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. * * NB: the dentry reference is _not_ consumed. If, for example, the dentry is * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)) { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; return do_dentry_open(file, d_backing_inode(dentry), open); } EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer * @dentry: dentry or NULL (as returned from ->lookup()) * * This can be used to set the result of a successful lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * * Returns "0" which must be the return value of ->atomic_open() after having * called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { file->f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); char *file_path(struct file *filp, char *buf, int buflen) { return d_path(&filp->f_path, buf, buflen); } EXPORT_SYMBOL(file_path); /** * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized * @cred: credentials to use */ int vfs_open(const struct path *path, struct file *file) { file->f_path = *path; return do_dentry_open(file, d_backing_inode(path->dentry), NULL); } struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; struct file *f; validate_creds(cred); /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } EXPORT_SYMBOL(dentry_open); struct file *open_with_fake_path(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { struct file *f = alloc_empty_file_noaccount(flags, cred); if (!IS_ERR(f)) { int error; f->f_path = *path; error = do_dentry_open(f, inode, NULL); if (error) { fput(f); f = ERR_PTR(error); } } return f; } EXPORT_SYMBOL(open_with_fake_path); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, .mode = mode & S_IALLUGO, }; /* O_PATH beats everything else. */ if (how.flags & O_PATH) how.flags &= O_PATH_FLAGS; /* Modes should only be set for create-like flags. */ if (!WILL_CREATE(how.flags)) how.mode = 0; return how; } inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; u64 strip = FMODE_NONOTIFY | O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), "struct open_flags doesn't yet handle flags > 32 bits"); /* * Strip flags that either shouldn't be set by userspace like * FMODE_NONOTIFY or that aren't relevant in determining struct * open_flags like O_CLOEXEC. */ flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument * values before calling build_open_flags(), but openat2(2) checks all * of its arguments. */ if (flags & ~VALID_OPEN_FLAGS) return -EINVAL; if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; /* Scoping flags are mutually exclusive. */ if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) return -EINVAL; /* Deal with the mode. */ if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) return -EINVAL; op->mode = how->mode | S_IFREG; } else { if (how->mode != 0) return -EINVAL; op->mode = 0; } /* * In order to ensure programs get explicit errors when trying to use * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we * have to require userspace to explicitly set it. */ if (flags & __O_TMPFILE) { if ((flags & O_TMPFILE_MASK) != O_TMPFILE) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; } if (flags & O_PATH) { /* O_PATH only permits certain other flags to be set. */ if (flags & ~O_PATH_FLAGS) return -EINVAL; acc_mode = 0; } /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's * always set instead of having to deal with possibly weird behaviour * for malicious applications setting only __O_SYNC. */ if (flags & __O_SYNC) flags |= O_DSYNC; op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ if (flags & O_TRUNC) acc_mode |= MAY_WRITE; /* Allow the LSM permission hook to distinguish append access from general write access. */ if (flags & O_APPEND) acc_mode |= MAY_APPEND; op->acc_mode = acc_mode; op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; flags |= O_NOFOLLOW; } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; if (how->resolve & RESOLVE_NO_MAGICLINKS) lookup_flags |= LOOKUP_NO_MAGICLINKS; if (how->resolve & RESOLVE_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; if (how->resolve & RESOLVE_BENEATH) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ if (flags & (O_TRUNC | O_CREAT | O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } op->lookup_flags = lookup_flags; return 0; } /** * file_open_name - open file and return file pointer * * @name: struct filename containing path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_filp_open(AT_FDCWD, name, &op); } /** * filp_open - open file and return file pointer * * @filename: path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); } return file; } EXPORT_SYMBOL(filp_open); struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); static long do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; int fd = build_open_flags(how, &op); struct filename *tmp; if (fd) return fd; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); } else { fsnotify_open(f); fd_install(fd, f); } } putname(tmp); return fd; } long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); } SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(dfd, filename, flags, mode); } SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, struct open_how __user *, how, size_t, usize) { int err; struct open_how tmp; BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) return err; /* O_LARGEFILE is only allowed for non-O_PATH. */ if (!(tmp.flags & O_PATH) && force_o_largefile()) tmp.flags |= O_LARGEFILE; return do_sys_openat2(dfd, filename, &tmp); } #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } /* * Exactly like sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } #endif #ifndef __alpha__ /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { int flags = O_CREAT | O_WRONLY | O_TRUNC; if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, pathname, flags, mode); } #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. */ int filp_close(struct file *filp, fl_owner_t id) { int retval = 0; if (!file_count(filp)) { printk(KERN_ERR "VFS: Close: file count is 0\n"); return 0; } if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } fput(filp); return retval; } EXPORT_SYMBOL(filp_close); /* * Careful here! We test whether the file pointer is NULL before * releasing the fd. This ensures that one clone task can't release * an fd while another clone is opening it. */ SYSCALL_DEFINE1(close, unsigned int, fd) { int retval = close_fd(fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK)) retval = -EINTR; return retval; } /** * close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: reserved for future extensions * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { return __close_range(fd, max_fd, flags); } /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ SYSCALL_DEFINE0(vhangup) { if (capable(CAP_SYS_TTY_CONFIG)) { tty_vhangup_self(); return 0; } return -EPERM; } /* * Called when an inode is about to be open. * We use this to disallow opening large files on 32bit systems if * the caller didn't specify O_LARGEFILE. On 64bit systems we force * on this flag in sys_open. */ int generic_file_open(struct inode * inode, struct file * filp) { if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable * file descriptors. The function is not supposed to ever fail, the only * reason it returns an 'int' and not 'void' is so that it can be plugged * directly into file_operations structure. */ int nonseekable_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); return 0; } EXPORT_SYMBOL(nonseekable_open); /* * stream_open is used by subsystems that want stream-like file descriptors. * Such file descriptors are not seekable and don't have notion of position * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). * Contrary to file descriptors of other regular files, .read() and .write() * can run simultaneously. * * stream_open never fails and is marked to return int so that it could be * directly used as file_operations.open . */ int stream_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); filp->f_mode |= FMODE_STREAM; return 0; } EXPORT_SYMBOL(stream_open);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the UDP protocol. * * Version: @(#)udp.h 1.0.2 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_UDP_H #define _LINUX_UDP_H #include <net/inet_sock.h> #include <linux/skbuff.h> #include <net/netns/hash.h> #include <uapi/linux/udp.h> static inline struct udphdr *udp_hdr(const struct sk_buff *skb) { return (struct udphdr *)skb_transport_header(skb); } static inline struct udphdr *inner_udp_hdr(const struct sk_buff *skb) { return (struct udphdr *)skb_inner_transport_header(skb); } #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { return (num + net_hash_mix(net)) & mask; } struct udp_sock { /* inet_sock has to be the first member */ struct inet_sock inet; #define udp_port_hash inet.sk.__sk_common.skc_u16hashes[0] #define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1] #define udp_portaddr_node inet.sk.__sk_common.skc_portaddr_node int pending; /* Any pending frames ? */ unsigned int corkflag; /* Cork is required */ __u8 encap_type; /* Is this an Encapsulation socket? */ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ encap_enabled:1, /* This socket enabled encap * processing; UDP tunnels and * different encapsulation layer set * this */ gro_enabled:1, /* Request GRO aggregation */ accept_udp_l4:1, accept_udp_fraglist:1; /* * Following member retains the information to create a UDP header * when the socket is uncorked. */ __u16 len; /* total length of pending frames */ __u16 gso_size; /* * Fields specific to UDP-Lite. */ __u16 pcslen; __u16 pcrlen; /* indicator bits used by pcflag: */ #define UDPLITE_BIT 0x1 /* set by udplite proto init function */ #define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ #define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */ __u8 pcflag; /* marks socket as UDP-Lite if > 0 */ __u8 unused[3]; /* * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); /* GRO functions for UDP socket */ struct sk_buff * (*gro_receive)(struct sock *sk, struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sock *sk, struct sk_buff *skb, int nhoff); /* udp_recvmsg try to use this before splicing sk_receive_queue */ struct sk_buff_head reader_queue ____cacheline_aligned_in_smp; /* This field is dirtied by udp_recvmsg() */ int forward_deficit; }; #define UDP_MAX_SEGMENTS (1 << 6UL) static inline struct udp_sock *udp_sk(const struct sock *sk) { return (struct udp_sock *)sk; } static inline void udp_set_no_check6_tx(struct sock *sk, bool val) { udp_sk(sk)->no_check6_tx = val; } static inline void udp_set_no_check6_rx(struct sock *sk, bool val) { udp_sk(sk)->no_check6_rx = val; } static inline bool udp_get_no_check6_tx(struct sock *sk) { return udp_sk(sk)->no_check6_tx; } static inline bool udp_get_no_check6_rx(struct sock *sk) { return udp_sk(sk)->no_check6_rx; } static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int gso_size; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { gso_size = skb_shinfo(skb)->gso_size; put_cmsg(msg, SOL_UDP, UDP_GRO, sizeof(gso_size), &gso_size); } } static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) { if (!skb_is_gso(skb)) return false; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4) return true; if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist) return true; return false; } static inline void udp_allow_gso(struct sock *sk) { udp_sk(sk)->accept_udp_l4 = 1; udp_sk(sk)->accept_udp_fraglist = 1; } #define udp_portaddr_for_each_entry(__sk, list) \ hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) #endif /* _LINUX_UDP_H */
1085 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #include <linux/if_vlan.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/skbuff.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/rtnetlink.h> #include "datapath.h" #include "vport-internal_dev.h" #include "vport-netdev.h" struct internal_dev { struct vport *vport; }; static struct vport_ops ovs_internal_vport_ops; static struct internal_dev *internal_dev_priv(struct net_device *netdev) { return netdev_priv(netdev); } /* Called with rcu_read_lock_bh. */ static netdev_tx_t internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { int len, err; /* store len value because skb can be freed inside ovs_vport_receive() */ len = skb->len; rcu_read_lock(); err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); if (likely(!err)) dev_sw_netstats_tx_add(netdev, 1, len); else netdev->stats.tx_errors++; return NETDEV_TX_OK; } static int internal_dev_open(struct net_device *netdev) { netif_start_queue(netdev); return 0; } static int internal_dev_stop(struct net_device *netdev) { netif_stop_queue(netdev); return 0; } static void internal_dev_getinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { strlcpy(info->driver, "openvswitch", sizeof(info->driver)); } static const struct ethtool_ops internal_dev_ethtool_ops = { .get_drvinfo = internal_dev_getinfo, .get_link = ethtool_op_get_link, }; static void internal_dev_destructor(struct net_device *dev) { struct vport *vport = ovs_internal_dev_get_vport(dev); ovs_vport_free(vport); } static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_get_stats64 = dev_get_tstats64, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { .kind = "openvswitch", }; static void do_setup(struct net_device *netdev) { ether_setup(netdev); netdev->max_mtu = ETH_MAX_MTU; netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | IFF_NO_QUEUE; netdev->needs_free_netdev = true; netdev->priv_destructor = NULL; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; netdev->vlan_features = netdev->features; netdev->hw_enc_features = netdev->features; netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; netdev->hw_features = netdev->features & ~NETIF_F_LLTX; eth_hw_addr_random(netdev); } static struct vport *internal_dev_create(const struct vport_parms *parms) { struct vport *vport; struct internal_dev *internal_dev; struct net_device *dev; int err; vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); goto error; } dev = alloc_netdev(sizeof(struct internal_dev), parms->name, NET_NAME_USER, do_setup); vport->dev = dev; if (!vport->dev) { err = -ENOMEM; goto error_free_vport; } vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vport->dev->tstats) { err = -ENOMEM; goto error_free_netdev; } dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = vport; /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) vport->dev->features |= NETIF_F_NETNS_LOCAL; rtnl_lock(); err = register_netdevice(vport->dev); if (err) goto error_unlock; vport->dev->priv_destructor = internal_dev_destructor; dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); netif_start_queue(vport->dev); return vport; error_unlock: rtnl_unlock(); free_percpu(dev->tstats); error_free_netdev: free_netdev(dev); error_free_vport: ovs_vport_free(vport); error: return ERR_PTR(err); } static void internal_dev_destroy(struct vport *vport) { netif_stop_queue(vport->dev); rtnl_lock(); dev_set_promiscuity(vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. */ unregister_netdevice(vport->dev); free_percpu(vport->dev->tstats); rtnl_unlock(); } static netdev_tx_t internal_dev_recv(struct sk_buff *skb) { struct net_device *netdev = skb->dev; if (unlikely(!(netdev->flags & IFF_UP))) { kfree_skb(skb); netdev->stats.rx_dropped++; return NETDEV_TX_OK; } skb_dst_drop(skb); nf_reset_ct(skb); secpath_reset(skb); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, netdev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); dev_sw_netstats_rx_add(netdev, skb->len); netif_rx(skb); return NETDEV_TX_OK; } static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, .send = internal_dev_recv, }; int ovs_is_internal_dev(const struct net_device *netdev) { return netdev->netdev_ops == &internal_dev_netdev_ops; } struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) { if (!ovs_is_internal_dev(netdev)) return NULL; return internal_dev_priv(netdev)->vport; } int ovs_internal_dev_rtnl_link_register(void) { int err; err = rtnl_link_register(&internal_dev_link_ops); if (err < 0) return err; err = ovs_vport_ops_register(&ovs_internal_vport_ops); if (err < 0) rtnl_link_unregister(&internal_dev_link_ops); return err; } void ovs_internal_dev_rtnl_link_unregister(void) { ovs_vport_ops_unregister(&ovs_internal_vport_ops); rtnl_link_unregister(&internal_dev_link_ops); }
1 1 1 3 3 2 2 2 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <net/dst_metadata.h> #include <net/net_namespace.h> #include <net/udp.h> #include <net/udp_tunnel.h> int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { int err; struct socket *sock = NULL; struct sockaddr_in udp_addr; err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; if (cfg->bind_ifindex) { err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr *)&udp_addr, sizeof(udp_addr)); if (err < 0) goto error; if (cfg->peer_udp_port) { udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->peer_ip; udp_addr.sin_port = cfg->peer_udp_port; err = kernel_connect(sock, (struct sockaddr *)&udp_addr, sizeof(udp_addr), 0); if (err < 0) goto error; } sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; *sockp = sock; return 0; error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL(udp_sock_create4); void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { struct sock *sk = sock->sk; /* Disable multicast loopback */ inet_sk(sk)->mc_loop = 0; /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ inet_inc_convert_csum(sk); rcu_assign_sk_user_data(sk, cfg->sk_user_data); udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv; udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sock); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; udp_tunnel_nic_add_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; udp_tunnel_nic_del_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); /* Notify netdevs that UDP port started listening */ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct udp_tunnel_info ti; struct net_device *dev; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; rcu_read_lock(); for_each_netdev_rcu(net, dev) { udp_tunnel_nic_add_port(dev, &ti); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); /* Notify netdevs that UDP port is no more listening */ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct udp_tunnel_info ti; struct net_device *dev; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; rcu_read_lock(); for_each_netdev_rcu(net, dev) { udp_tunnel_nic_del_port(dev, &ti); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck) { struct udphdr *uh; __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); udp_set_csum(nocheck, skb, src, dst, skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); synchronize_rcu(); kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, __be16 flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; if (family == AF_INET) tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); else tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; if (udp_hdr(skb)->check) info->key.tun_flags |= TUNNEL_CSUM; return tun_dst; } EXPORT_SYMBOL_GPL(udp_tun_rx_dst); MODULE_LICENSE("GPL");
67 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/util.c * Copyright (C) 1992 Krishna Balasubramanian * * Sep 1997 - Call suser() last after "normal" permission checks so we * get BSD style process accounting right. * Occurs in several places in the IPC code. * Chris Evans, <chris@ferret.lmh.ox.ac.uk> * Nov 1999 - ipc helper functions, unified SMP locking * Manfred Spraul <manfred@colorfullife.com> * Oct 2002 - One lock per IPC id. RCU ipc_free for lock-free grow_ary(). * Mingming Cao <cmm@us.ibm.com> * Mar 2006 - support for audit of ipc object properties * Dustin Kirkland <dustin.kirkland@us.ibm.com> * Jun 2006 - namespaces ssupport * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> * * General sysv ipc locking scheme: * rcu_read_lock() * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr * tree. * - perform initial checks (capabilities, auditing and permission, * etc). * - perform read-only operations, such as INFO command, that * do not demand atomicity * acquire the ipc lock (kern_ipc_perm.lock) through * ipc_lock_object() * - perform read-only operations that demand atomicity, * such as STAT command. * - perform data updates, such as SET, RMID commands and * mechanism-specific operations (semop/semtimedop, * msgsnd/msgrcv, shmat/shmdt). * drop the ipc lock, through ipc_unlock_object(). * rcu_read_unlock() * * The ids->rwsem must be taken when: * - creating, removing and iterating the existing entries in ipc * identifier sets. * - iterating through files under /proc/sysvipc/ * * Note that sems have a special fast path that avoids kern_ipc_perm.lock - * see sem_lock(). */ #include <linux/mm.h> #include <linux/shm.h> #include <linux/init.h> #include <linux/msg.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/capability.h> #include <linux/highuid.h> #include <linux/security.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/audit.h> #include <linux/nsproxy.h> #include <linux/rwsem.h> #include <linux/memory.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/log2.h> #include <asm/unistd.h> #include "util.h" struct ipc_proc_iface { const char *path; const char *header; int ids; int (*show)(struct seq_file *, void *); }; /** * ipc_init - initialise ipc subsystem * * The various sysv ipc resources (semaphores, messages and shared * memory) are initialised. * * A callback routine is registered into the memory hotplug notifier * chain: since msgmni scales to lowmem this callback routine will be * called upon successful memory add / remove to recompute msmgni. */ static int __init ipc_init(void) { proc_mkdir("sysvipc", NULL); sem_init(); msg_init(); shm_init(); return 0; } device_initcall(ipc_init); static const struct rhashtable_params ipc_kht_params = { .head_offset = offsetof(struct kern_ipc_perm, khtnode), .key_offset = offsetof(struct kern_ipc_perm, key), .key_len = sizeof_field(struct kern_ipc_perm, key), .automatic_shrinking = true, }; /** * ipc_init_ids - initialise ipc identifiers * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited * below ipc_mni) then initialise the keys hashtable and ids idr. */ void ipc_init_ids(struct ipc_ids *ids) { ids->in_use = 0; ids->seq = 0; init_rwsem(&ids->rwsem); rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); ids->max_idx = -1; ids->last_idx = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif } #ifdef CONFIG_PROC_FS static const struct proc_ops sysvipc_proc_ops; /** * ipc_init_proc_interface - create a proc interface for sysipc types using a seq_file interface. * @path: Path in procfs * @header: Banner to be printed at the beginning of the file. * @ids: ipc id table to iterate. * @show: show routine. */ void __init ipc_init_proc_interface(const char *path, const char *header, int ids, int (*show)(struct seq_file *, void *)) { struct proc_dir_entry *pde; struct ipc_proc_iface *iface; iface = kmalloc(sizeof(*iface), GFP_KERNEL); if (!iface) return; iface->path = path; iface->header = header; iface->ids = ids; iface->show = show; pde = proc_create_data(path, S_IRUGO, /* world readable */ NULL, /* parent dir */ &sysvipc_proc_ops, iface); if (!pde) kfree(iface); } #endif /** * ipc_findkey - find a key in an ipc identifier set * @ids: ipc identifier set * @key: key to find * * Returns the locked pointer to the ipc structure if found or NULL * otherwise. If key is found ipc points to the owning ipc structure * * Called with writer ipc_ids.rwsem held. */ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) { struct kern_ipc_perm *ipcp; ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, ipc_kht_params); if (!ipcp) return NULL; rcu_read_lock(); ipc_lock_object(ipcp); return ipcp; } /* * Insert new IPC object into idr tree, and set sequence number and id * in the correct order. * Especially: * - the sequence number must be set before inserting the object into the idr, * because the sequence number is accessed without a lock. * - the id can/must be set after inserting the object into the idr. * All accesses must be done after getting kern_ipc_perm.lock. * * The caller must own kern_ipc_perm.lock.of the new object. * On error, the function returns a (negative) error code. * * To conserve sequence number space, especially with extended ipc_mni, * the sequence number is incremented only when the returned ID is less than * the last one. */ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { int idx, next_id = -1; #ifdef CONFIG_CHECKPOINT_RESTORE next_id = ids->next_id; ids->next_id = -1; #endif /* * As soon as a new object is inserted into the idr, * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it, * and the lockless preparations for ipc operations can start. * This means especially: permission checks, audit calls, allocation * of undo structures, ... * * Thus the object must be fully initialized, and if something fails, * then the full tear-down sequence must be followed. * (i.e.: set new->deleted, reduce refcount, call_rcu()) */ if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ int max_idx; max_idx = max(ids->in_use*3/2, ipc_min_cycle); max_idx = min(max_idx, ipc_mni); /* allocate the idx, with a NULL struct kern_ipc_perm */ idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx, GFP_NOWAIT); if (idx >= 0) { /* * idx got allocated successfully. * Now calculate the sequence number and set the * pointer for real. */ if (idx <= ids->last_idx) { ids->seq++; if (ids->seq >= ipcid_seq_max()) ids->seq = 0; } ids->last_idx = idx; new->seq = ids->seq; /* no need for smp_wmb(), this is done * inside idr_replace, as part of * rcu_assign_pointer */ idr_replace(&ids->ipcs_idr, new, idx); } } else { new->seq = ipcid_to_seqx(next_id); idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), 0, GFP_NOWAIT); } if (idx >= 0) new->id = (new->seq << ipcmni_seq_shift()) + idx; return idx; } /** * ipc_addid - add an ipc identifier * @ids: ipc identifier set * @new: new ipc permission set * @limit: limit for the number of used ids * * Add an entry 'new' to the ipc ids idr. The permissions object is * initialised and the first free entry is set up and the index assigned * is returned. The 'new' entry is returned in a locked state on success. * * On failure the entry is not locked and a negative err-code is returned. * The caller must use ipc_rcu_putref() to free the identifier. * * Called with writer ipc_ids.rwsem held. */ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) { kuid_t euid; kgid_t egid; int idx, err; /* 1) Initialize the refcount so that ipc_rcu_putref works */ refcount_set(&new->refcount, 1); if (limit > ipc_mni) limit = ipc_mni; if (ids->in_use >= limit) return -ENOSPC; idr_preload(GFP_KERNEL); spin_lock_init(&new->lock); rcu_read_lock(); spin_lock(&new->lock); current_euid_egid(&euid, &egid); new->cuid = new->uid = euid; new->gid = new->cgid = egid; new->deleted = false; idx = ipc_idr_alloc(ids, new); idr_preload_end(); if (idx >= 0 && new->key != IPC_PRIVATE) { err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode, ipc_kht_params); if (err < 0) { idr_remove(&ids->ipcs_idr, idx); idx = err; } } if (idx < 0) { new->deleted = true; spin_unlock(&new->lock); rcu_read_unlock(); return idx; } ids->in_use++; if (idx > ids->max_idx) ids->max_idx = idx; return idx; } /** * ipcget_new - create a new ipc object * @ns: ipc namespace * @ids: ipc identifier set * @ops: the actual creation routine to call * @params: its parameters * * This routine is called by sys_msgget, sys_semget() and sys_shmget() * when the key is IPC_PRIVATE. */ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { int err; down_write(&ids->rwsem); err = ops->getnew(ns, params); up_write(&ids->rwsem); return err; } /** * ipc_check_perms - check security and permissions for an ipc object * @ns: ipc namespace * @ipcp: ipc permission set * @ops: the actual security routine to call * @params: its parameters * * This routine is called by sys_msgget(), sys_semget() and sys_shmget() * when the key is not IPC_PRIVATE and that key already exists in the * ds IDR. * * On success, the ipc id is returned. * * It is called with ipc_ids.rwsem and ipcp->lock held. */ static int ipc_check_perms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, const struct ipc_ops *ops, struct ipc_params *params) { int err; if (ipcperms(ns, ipcp, params->flg)) err = -EACCES; else { err = ops->associate(ipcp, params->flg); if (!err) err = ipcp->id; } return err; } /** * ipcget_public - get an ipc object or create a new one * @ns: ipc namespace * @ids: ipc identifier set * @ops: the actual creation routine to call * @params: its parameters * * This routine is called by sys_msgget, sys_semget() and sys_shmget() * when the key is not IPC_PRIVATE. * It adds a new entry if the key is not found and does some permission * / security checkings if the key is found. * * On success, the ipc id is returned. */ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { struct kern_ipc_perm *ipcp; int flg = params->flg; int err; /* * Take the lock as a writer since we are potentially going to add * a new entry + read locks are not "upgradable" */ down_write(&ids->rwsem); ipcp = ipc_findkey(ids, params->key); if (ipcp == NULL) { /* key not used */ if (!(flg & IPC_CREAT)) err = -ENOENT; else err = ops->getnew(ns, params); } else { /* ipc object has been locked by ipc_findkey() */ if (flg & IPC_CREAT && flg & IPC_EXCL) err = -EEXIST; else { err = 0; if (ops->more_checks) err = ops->more_checks(ipcp, params); if (!err) /* * ipc_check_perms returns the IPC id on * success */ err = ipc_check_perms(ns, ipcp, ops, params); } ipc_unlock(ipcp); } up_write(&ids->rwsem); return err; } /** * ipc_kht_remove - remove an ipc from the key hashtable * @ids: ipc identifier set * @ipcp: ipc perm structure containing the key to remove * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { if (ipcp->key != IPC_PRIVATE) WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, ipc_kht_params)); } /** * ipc_search_maxidx - search for the highest assigned index * @ids: ipc identifier set * @limit: known upper limit for highest assigned index * * The function determines the highest assigned index in @ids. It is intended * to be called when ids->max_idx needs to be updated. * Updating ids->max_idx is necessary when the current highest index ipc * object is deleted. * If no ipc object is allocated, then -1 is returned. * * ipc_ids.rwsem needs to be held by the caller. */ static int ipc_search_maxidx(struct ipc_ids *ids, int limit) { int tmpidx; int i; int retval; i = ilog2(limit+1); retval = 0; for (; i >= 0; i--) { tmpidx = retval | (1<<i); /* * "0" is a possible index value, thus search using * e.g. 15,7,3,1,0 instead of 16,8,4,2,1. */ tmpidx = tmpidx-1; if (idr_get_next(&ids->ipcs_idr, &tmpidx)) retval |= (1<<i); } return retval - 1; } /** * ipc_rmid - remove an ipc identifier * @ids: ipc identifier set * @ipcp: ipc perm structure containing the identifier to remove * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id); WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; if (unlikely(idx == ids->max_idx)) { idx = ids->max_idx-1; if (idx >= 0) idx = ipc_search_maxidx(ids, idx); ids->max_idx = idx; } } /** * ipc_set_key_private - switch the key of an existing ipc to IPC_PRIVATE * @ids: ipc identifier set * @ipcp: ipc perm structure containing the key to modify * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { ipc_kht_remove(ids, ipcp); ipcp->key = IPC_PRIVATE; } bool ipc_rcu_getref(struct kern_ipc_perm *ptr) { return refcount_inc_not_zero(&ptr->refcount); } void ipc_rcu_putref(struct kern_ipc_perm *ptr, void (*func)(struct rcu_head *head)) { if (!refcount_dec_and_test(&ptr->refcount)) return; call_rcu(&ptr->rcu, func); } /** * ipcperms - check ipc permissions * @ns: ipc namespace * @ipcp: ipc permission set * @flag: desired permission set * * Check user, group, other permissions for access * to ipc resources. return 0 if allowed * * @flag will most probably be 0 or ``S_...UGO`` from <linux/stat.h> */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag) { kuid_t euid = current_euid(); int requested_mode, granted_mode; audit_ipc_obj(ipcp); requested_mode = (flag >> 6) | (flag >> 3) | flag; granted_mode = ipcp->mode; if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid)) granted_mode >>= 6; else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid)) granted_mode >>= 3; /* is there some bit set in requested_mode but not in granted_mode? */ if ((requested_mode & ~granted_mode & 0007) && !ns_capable(ns->user_ns, CAP_IPC_OWNER)) return -1; return security_ipc_permission(ipcp, flag); } /* * Functions to convert between the kern_ipc_perm structure and the * old/new ipc_perm structures */ /** * kernel_to_ipc64_perm - convert kernel ipc permissions to user * @in: kernel permissions * @out: new style ipc permissions * * Turn the kernel object @in into a set of permissions descriptions * for returning to userspace (@out). */ void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out) { out->key = in->key; out->uid = from_kuid_munged(current_user_ns(), in->uid); out->gid = from_kgid_munged(current_user_ns(), in->gid); out->cuid = from_kuid_munged(current_user_ns(), in->cuid); out->cgid = from_kgid_munged(current_user_ns(), in->cgid); out->mode = in->mode; out->seq = in->seq; } /** * ipc64_perm_to_ipc_perm - convert new ipc permissions to old * @in: new style ipc permissions * @out: old style ipc permissions * * Turn the new style permissions object @in into a compatibility * object and store it into the @out pointer. */ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) { out->key = in->key; SET_UID(out->uid, in->uid); SET_GID(out->gid, in->gid); SET_UID(out->cuid, in->cuid); SET_GID(out->cgid, in->cgid); out->mode = in->mode; out->seq = in->seq; } /** * ipc_obtain_object_idr * @ids: ipc identifier set * @id: ipc id to look for * * Look for an id in the ipc ids idr and return associated ipc object. * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out; int idx = ipcid_to_idx(id); out = idr_find(&ids->ipcs_idr, idx); if (!out) return ERR_PTR(-EINVAL); return out; } /** * ipc_obtain_object_check * @ids: ipc identifier set * @id: ipc id to look for * * Similar to ipc_obtain_object_idr() but also checks the ipc object * sequence number. * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out = ipc_obtain_object_idr(ids, id); if (IS_ERR(out)) goto out; if (ipc_checkid(out, id)) return ERR_PTR(-EINVAL); out: return out; } /** * ipcget - Common sys_*get() code * @ns: namespace * @ids: ipc identifier set * @ops: operations to be called on ipc object creation, permission checks * and further checks * @params: the parameters needed by the previous operations. * * Common routine called by sys_msgget(), sys_semget() and sys_shmget(). */ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { if (params->key == IPC_PRIVATE) return ipcget_new(ns, ids, ops, params); else return ipcget_public(ns, ids, ops, params); } /** * ipc_update_perm - update the permissions of an ipc object * @in: the permission given as input. * @out: the permission of the ipc to set. */ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) { kuid_t uid = make_kuid(current_user_ns(), in->uid); kgid_t gid = make_kgid(current_user_ns(), in->gid); if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; out->uid = uid; out->gid = gid; out->mode = (out->mode & ~S_IRWXUGO) | (in->mode & S_IRWXUGO); return 0; } /** * ipcctl_obtain_check - retrieve an ipc object and check permissions * @ns: ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve * @cmd: the cmd to check * @perm: the permission to set * @extra_perm: one extra permission parameter used by msq * * This function does some common audit and permissions check for some IPC_XXX * cmd and is called from semctl_down, shmctl_down and msgctl_down. * * It: * - retrieves the ipc object with the given id in the given table. * - performs some audit and permission check, depending on the given cmd * - returns a pointer to the ipc object or otherwise, the corresponding * error. * * Call holding the both the rwsem and the rcu read lock. */ struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm) { kuid_t euid; int err = -EPERM; struct kern_ipc_perm *ipcp; ipcp = ipc_obtain_object_check(ids, id); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto err; } audit_ipc_obj(ipcp); if (cmd == IPC_SET) audit_ipc_set_perm(extra_perm, perm->uid, perm->gid, perm->mode); euid = current_euid(); if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid) || ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ipcp; /* successful lookup */ err: return ERR_PTR(err); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /** * ipc_parse_version - ipc call version * @cmd: pointer to command * * Return IPC_64 for new style IPC and IPC_OLD for old style IPC. * The @cmd value is turned from an encoding command and version into * just the command code. */ int ipc_parse_version(int *cmd) { if (*cmd & IPC_64) { *cmd ^= IPC_64; return IPC_64; } else { return IPC_OLD; } } #endif /* CONFIG_ARCH_WANT_IPC_PARSE_VERSION */ #ifdef CONFIG_PROC_FS struct ipc_proc_iter { struct ipc_namespace *ns; struct pid_namespace *pid_ns; struct ipc_proc_iface *iface; }; struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) { struct ipc_proc_iter *iter = s->private; return iter->pid_ns; } /* * This routine locks the ipc structure found at least at position pos. */ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, loff_t *new_pos) { struct kern_ipc_perm *ipc = NULL; int max_idx = ipc_get_maxidx(ids); if (max_idx == -1 || pos > max_idx) goto out; for (; pos <= max_idx; pos++) { ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { rcu_read_lock(); ipc_lock_object(ipc); break; } } out: *new_pos = pos + 1; return ipc; } static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct kern_ipc_perm *ipc = it; /* If we had an ipc id locked before, unlock it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); return sysvipc_find_ipc(&iter->ns->ids[iface->ids], *pos, pos); } /* * File positions: pos 0 -> header, pos n -> ipc id = n - 1. * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. */ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct ipc_ids *ids; ids = &iter->ns->ids[iface->ids]; /* * Take the lock - this will be released by the corresponding * call to stop(). */ down_read(&ids->rwsem); /* pos < 0 is invalid */ if (*pos < 0) return NULL; /* pos == 0 means header */ if (*pos == 0) return SEQ_START_TOKEN; /* Find the (pos-1)th ipc */ return sysvipc_find_ipc(ids, *pos - 1, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) { struct kern_ipc_perm *ipc = it; struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct ipc_ids *ids; /* If we had a locked structure, release it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); ids = &iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ up_read(&ids->rwsem); } static int sysvipc_proc_show(struct seq_file *s, void *it) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; if (it == SEQ_START_TOKEN) { seq_puts(s, iface->header); return 0; } return iface->show(s, it); } static const struct seq_operations sysvipc_proc_seqops = { .start = sysvipc_proc_start, .stop = sysvipc_proc_stop, .next = sysvipc_proc_next, .show = sysvipc_proc_show, }; static int sysvipc_proc_open(struct inode *inode, struct file *file) { struct ipc_proc_iter *iter; iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter)); if (!iter) return -ENOMEM; iter->iface = PDE_DATA(inode); iter->ns = get_ipc_ns(current->nsproxy->ipc_ns); iter->pid_ns = get_pid_ns(task_active_pid_ns(current)); return 0; } static int sysvipc_proc_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct ipc_proc_iter *iter = seq->private; put_ipc_ns(iter->ns); put_pid_ns(iter->pid_ns); return seq_release_private(inode, file); } static const struct proc_ops sysvipc_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = sysvipc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = sysvipc_proc_release, }; #endif /* CONFIG_PROC_FS */
564 564 564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com> * * Based on the shift-and-subtract algorithm for computing integer * square root from Guy L. Steele. */ #include <linux/export.h> #include <linux/bitops.h> #include <linux/limits.h> #include <linux/math.h> /** * int_sqrt - computes the integer square root * @x: integer of which to calculate the sqrt * * Computes: floor(sqrt(x)) */ unsigned long int_sqrt(unsigned long x) { unsigned long b, m, y = 0; if (x <= 1) return x; m = 1UL << (__fls(x) & ~1UL); while (m != 0) { b = y + m; y >>= 1; if (x >= b) { x -= b; y += m; } m >>= 2; } return y; } EXPORT_SYMBOL(int_sqrt); #if BITS_PER_LONG < 64 /** * int_sqrt64 - strongly typed int_sqrt function when minimum 64 bit input * is expected. * @x: 64bit integer of which to calculate the sqrt */ u32 int_sqrt64(u64 x) { u64 b, m, y = 0; if (x <= ULONG_MAX) return int_sqrt((unsigned long) x); m = 1ULL << ((fls64(x) - 1) & ~1ULL); while (m != 0) { b = y + m; y >>= 1; if (x >= b) { x -= b; y += m; } m >>= 2; } return y; } EXPORT_SYMBOL(int_sqrt64); #endif
3 563 1 1 1 1 1 564 564 1084 1056 167 167 564 563 564 2 2 1 2 1 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux IPv6 multicast routing support for BSD pim6sd * Based on net/ipv4/ipmr.c. * * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr> * LSIIT Laboratory, Strasbourg, France * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com> * 6WIND, Paris, France * Copyright (C)2007,2008 USAGI/WIDE Project * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/rhashtable.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/raw.h> #include <linux/notifier.h> #include <linux/if_arp.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <linux/mroute6.h> #include <linux/pim.h> #include <net/addrconf.h> #include <linux/netfilter_ipv6.h> #include <linux/export.h> #include <net/ip6_checksum.h> #include <linux/netconf.h> #include <net/ip_tunnels.h> #include <linux/nospec.h> struct ip6mr_rule { struct fib_rule common; }; struct ip6mr_result { struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. Note that the changes are semaphored via rtnl_lock. */ static DEFINE_RWLOCK(mrt_lock); /* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved entries is changed only in process context and protected with weak lock mrt_lock. Queue of unresolved entries is protected with strong spinlock mfc_unres_lock. In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *cache); static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \ lockdep_rtnl_is_held() || \ list_empty(&net->ipv6.mr6_tables)) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { struct mr_table *ret; if (!mrt) ret = list_entry_rcu(net->ipv6.mr6_tables.next, struct mr_table, list); else ret = list_entry_rcu(mrt->list.next, struct mr_table, list); if (&ret->list == &net->ipv6.mr6_tables) return NULL; return ret; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; ip6mr_for_each_table(mrt, net) { if (mrt->id == id) return mrt; } return NULL; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { int err; struct ip6mr_result res; struct fib_lookup_arg arg = { .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(flp6)); err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); if (err < 0) return err; *mrt = res.mrt; return 0; } static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ip6mr_result *res = arg->result; struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } arg->table = fib_rule_get_table(rule, arg); mrt = ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; } static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) { return 1; } static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = { FRA_GENERIC_POLICY, }; static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { return 0; } static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { return 1; } static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .family = RTNL_FAMILY_IP6MR, .rule_size = sizeof(struct ip6mr_rule), .addr_size = sizeof(struct in6_addr), .action = ip6mr_rule_action, .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .policy = ip6mr_rule_policy, .owner = THIS_MODULE, }; static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ip6mr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv6.mr6_tables); mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0); if (err < 0) goto err2; net->ipv6.mr6_rules_ops = ops; return 0; err2: rtnl_lock(); ip6mr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ip6mr_rules_exit(struct net *net) { struct mr_table *mrt, *next; rtnl_lock(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); ip6mr_free_table(mrt); } fib_rules_unregister(net->ipv6.mr6_rules_ops); rtnl_unlock(); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } bool ip6mr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL && rule->table == RT6_TABLE_DFLT && !rule->l3mdev; } EXPORT_SYMBOL(ip6mr_rule_default); #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv6.mrt6; return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { *mrt = net->ipv6.mrt6; return 0; } static int __net_init ip6mr_rules_init(struct net *net) { struct mr_table *mrt; mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv6.mrt6 = mrt; return 0; } static void __net_exit ip6mr_rules_exit(struct net *net) { rtnl_lock(); ip6mr_free_table(net->ipv6.mrt6); net->ipv6.mrt6 = NULL; rtnl_unlock(); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ip6mr_rules_seq_read(struct net *net) { return 0; } #endif static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc6_cache_cmp_arg *cmparg = arg->key; struct mfc6_cache *c = (struct mfc6_cache *)ptr; return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) || !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin); } static const struct rhashtable_params ip6mr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc6_cache, cmparg), .key_len = sizeof(struct mfc6_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ip6mr_hash_cmp, .automatic_shrinking = true, }; static void ip6mr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); #endif } static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = { .mf6c_origin = IN6ADDR_ANY_INIT, .mf6c_mcastgrp = IN6ADDR_ANY_INIT, }; static struct mr_table_ops ip6mr_mr_table_ops = { .rht_params = &ip6mr_rht_params, .cmparg_any = &ip6mr_mr_table_ops_cmparg_any, }; static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; mrt = ip6mr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ip6mr_mr_table_ops, ipmr_expire_process, ip6mr_new_table_set); } static void ip6mr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing * /proc/ip6_mr_cache /proc/ip6_mr_vif */ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(mrt_lock) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); iter->mrt = mrt; read_lock(&mrt_lock); return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) __releases(mrt_lock) { read_unlock(&mrt_lock); } static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { const struct vif_device *vif = v; const char *name = vif->dev ? vif->dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); } return 0; } static const struct seq_operations ip6mr_vif_seq_ops = { .start = ip6mr_vif_seq_start, .next = mr_vif_seq_next, .stop = ip6mr_vif_seq_stop, .show = ip6mr_vif_seq_show, }; static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Group " "Origin " "Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc6_cache *mfc = v; const struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", mfc->_c.mfc_un.res.pkt, mfc->_c.mfc_un.res.bytes, mfc->_c.mfc_un.res.wrong_if); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain * pkt, bytes and wrong_if values */ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = mr_mfc_seq_next, .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; #endif #ifdef CONFIG_IPV6_PIMSM_V2 static int pim6_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int reg_vif_num; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) || (pim->flags & PIM_NULL_REGISTER) || (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, sizeof(*pim), IPPROTO_PIM, csum_partial((void *)pim, sizeof(*pim), 0)) && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; /* check if the inner packet is destined to mcast group */ encap = (struct ipv6hdr *)(skb_transport_header(skb) + sizeof(*pim)); if (!ipv6_addr_is_multicast(&encap->daddr) || encap->payload_len == 0 || ntohs(encap->payload_len) + sizeof(*pim) > skb->len) goto drop; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto drop; reg_vif_num = mrt->mroute_reg_vif_num; read_lock(&mrt_lock); if (reg_vif_num >= 0) reg_dev = mrt->vif_table[reg_vif_num].dev; dev_hold(reg_dev); read_unlock(&mrt_lock); if (!reg_dev) goto drop; skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); dev_put(reg_dev); return 0; drop: kfree_skb(skb); return 0; } static const struct inet6_protocol pim6_protocol = { .handler = pim6_rcv, }; /* Service routines creating virtual interfaces: PIMREG */ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; if (!pskb_inet_may_pull(skb)) goto tx_err; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); return NETDEV_TX_OK; tx_err: dev->stats.tx_errors++; kfree_skb(skb); return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) { return 0; } static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; dev->features |= NETIF_F_NETNS_LOCAL; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; if (mrt->id == RT6_TABLE_DFLT) sprintf(name, "pim6reg"); else sprintf(name, "pim6reg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (!dev) return NULL; dev_net_set(dev, net); if (register_netdevice(dev)) { free_netdev(dev); return NULL; } if (dev_open(dev, NULL)) goto failure; dev_hold(dev); return dev; failure: unregister_netdevice(dev); return NULL; } #endif static int call_ip6mr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, mifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, vif, vif_index, tb_id, &net->ipv6.ipmr_seq); } static int call_ip6mr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc6_cache *mfc, u32 tb_id) { return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type, &mfc->_c, tb_id, &net->ipv6.ipmr_seq); } /* Delete a VIF entry */ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct vif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; v = &mrt->vif_table[vifi]; if (VIF_EXISTS(mrt, vifi)) call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_VIF_DEL, v, vifi, mrt->id); write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; if (!dev) { write_unlock_bh(&mrt_lock); return -EADDRNOTAVAIL; } #ifdef CONFIG_IPV6_PIMSM_V2 if (vifi == mrt->mroute_reg_vif_num) mrt->mroute_reg_vif_num = -1; #endif if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } mrt->maxvif = tmp + 1; } write_unlock_bh(&mrt_lock); dev_set_allmulti(dev, -1); in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } if ((v->flags & MIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); dev_put(dev); return 0; } static inline void ip6mr_cache_free_rcu(struct rcu_head *head) { struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c); } static inline void ip6mr_cache_free(struct mfc6_cache *c) { call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); } ip6mr_cache_free(c); } /* Timer process for all the unresolved queue. */ static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; struct mr_mfc *c, *next; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under write locked mrt_lock. */ static void ip6mr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXMIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXMIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } cache->mfc_un.res.lastuse = jiffies; } static int mif6_add(struct net *net, struct mr_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { #ifdef CONFIG_IPV6_PIMSM_V2 case MIFF_REGISTER: /* * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ip6mr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; #endif case 0: dev = dev_get_by_index(net, vifc->mif6c_pifi); if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->mif6c_flags | (!mrtsock ? VIFF_STATIC : 0), MIFF_REGISTER); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); v->dev = dev; #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) mrt->mroute_reg_vif_num = vifi; #endif if (vifi + 1 > mrt->maxvif) mrt->maxvif = vifi + 1; write_unlock_bh(&mrt_lock); call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); return 0; } static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = in6addr_any, .mf6c_mcastgrp = *mcastgrp, }; if (ipv6_addr_any(mcastgrp)) return mr_mfc_find_any_parent(mrt, mifi); return mr_mfc_find_any(mrt, mifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc6_cache * ip6mr_cache_find_parent(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp, int parent) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXMIFS; c->_c.free = ip6mr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } /* * A cache entry has gone into a resolved state from queued */ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; /* * Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else ip6_mr_forward(net, mrt, skb->dev, skb, c); } } /* * Bounce a cache query up to pim6sd and netlink. * * Called under mrt_lock. */ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else #endif skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); if (!skb) return -ENOBUFS; /* I suppose that internal messages * do not require checksums */ skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ skb_push(skb, -skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = MRT6MSG_WHOLEPKT; msg->im6_mif = mrt->mroute_reg_vif_num; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else #endif { /* * Copy the IP header */ skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); /* * Add our header */ skb_put(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } rcu_read_lock(); mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { rcu_read_unlock(); kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); /* Deliver to user space multicast routing algorithms */ ret = sock_queue_rcv_skb(mroute6_sk, skb); rcu_read_unlock(); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! */ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; break; } } if (!found) { /* * Create a new entry if allowable */ c = ip6mr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; /* * Reflect first query at pim6sd */ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ip6mr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* * MFC6 cache manipulation by user space */ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { struct mfc6_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); list_del_rcu(&c->_c.list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (v->dev == dev) mif6_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static unsigned int ip6mr_seq_read(struct net *net) { ASSERT_RTNL(); return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, ip6mr_mr_table_iter, &mrt_lock, extack); } static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; static const struct fib_notifier_ops ip6mr_notifier_ops_template = { .family = RTNL_FAMILY_IP6MR, .fib_seq_read = ip6mr_seq_read, .fib_dump = ip6mr_dump, .owner = THIS_MODULE, }; static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv6.ipmr_seq = 0; ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.ip6mr_notifier_ops = ops; return 0; } static void __net_exit ip6mr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); net->ipv6.ip6mr_notifier_ops = NULL; } /* Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; err = ip6mr_notifier_init(net); if (err) return err; err = ip6mr_rules_init(net); if (err < 0) goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: ip6mr_rules_exit(net); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); return err; } static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_rules_exit(net); ip6mr_notifier_exit(net); } static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, }; int __init ip6_mr_init(void) { int err; mrt_cachep = kmem_cache_create("ip6_mrt_cache", sizeof(struct mfc6_cache), 0, SLAB_HWCACHE_ALIGN, NULL); if (!mrt_cachep) return -ENOMEM; err = register_pernet_subsys(&ip6mr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip6_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IPV6_PIMSM_V2 if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, ip6mr_rtm_dumproute, 0); if (err == 0) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ip6mr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } void ip6_mr_cleanup(void) { rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); } static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { unsigned char ttls[MAXMIFS]; struct mfc6_cache *uc, *c; struct mr_mfc *_uc; bool found; int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; memset(ttls, 255, MAXMIFS); for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; } /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (c) { write_lock_bh(&mrt_lock); c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; c = ip6mr_cache_alloc(); if (!c) return -ENOMEM; c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* * Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT6_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } } } static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); write_lock_bh(&mrt_lock); if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } write_unlock_bh(&mrt_lock); if (!err) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); rtnl_unlock(); return err; } int ip6mr_sk_done(struct sock *sk) { int err = -EACCES; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return err; rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { write_lock_bh(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); /* Note that mroute_sk had SOCK_RCU_FREE set, * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ atomic_dec(&net->ipv6.devconf_all->mc_forwarding); write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); err = 0; break; } } rtnl_unlock(); return err; } bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; return rcu_access_pointer(mrt->mroute_sk); } EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; if (optname != MRT6_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT6_INIT: if (optlen < sizeof(int)) return -EINVAL; return ip6mr_sk_init(mrt, sk); case MRT6_DONE: return ip6mr_sk_done(sk); case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); ret = mif6_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); rtnl_unlock(); return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. */ case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; fallthrough; case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; rtnl_lock(); if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); rtnl_unlock(); return ret; case MRT6_FLUSH: { int flags; if (optlen != sizeof(flags)) return -EINVAL; if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); rtnl_unlock(); return 0; } /* * Control PIM assert (to activate pim will activate assert) */ case MRT6_ASSERT: { int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; } #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: { int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; } rtnl_unlock(); return ret; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES case MRT6_TABLE: { u32 v; if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); ret = 0; mrt = ip6mr_new_table(net, v); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw6_sk(sk)->ip6mr_table = v; rtnl_unlock(); return ret; } #endif /* * Spurious command, or MRT6_VERSION which you cannot * set. */ default: return -ENOPROTOOPT; } } /* * Getsock opt support for the multicast routing system. */ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (optname) { case MRT6_VERSION: val = 0x0305; break; #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: val = mrt->mroute_do_pim; break; #endif case MRT6_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (get_user(olr, optlen)) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; if (put_user(olr, optlen)) return -EFAULT; if (copy_to_user(optval, &val, olr)) return -EFAULT; return 0; } /* * The IP multicast ioctl support routines. */ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) { struct sioc_sg_req6 sr; struct sioc_mif_req6 vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); read_lock(&mrt_lock); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; vr.obytes = vif->bytes_out; read_unlock(&mrt_lock); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } read_unlock(&mrt_lock); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = c->_c.mfc_un.res.pkt; sr.bytecnt = c->_c.mfc_un.res.bytes; sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req6 { struct sockaddr_in6 src; struct sockaddr_in6 grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_mif_req6 { mifi_t mifi; compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); read_lock(&mrt_lock); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; vr.obytes = vif->bytes_out; read_unlock(&mrt_lock); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } read_unlock(&mrt_lock); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = c->_c.mfc_un.res.pkt; sr.bytecnt = c->_c.mfc_un.res.bytes; sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); IP6_ADD_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTOCTETS, skb->len); return dst_output(net, sk, skb); } /* * Processing handlers for ip6mr_forward */ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct ipv6hdr *ipv6h; struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; if (!vif->dev) goto out_free; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { vif->pkt_out++; vif->bytes_out += skb->len; vif->dev->stats.tx_bytes += skb->len; vif->dev->stats.tx_packets++; ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); goto out_free; } #endif ipv6h = ipv6_hdr(skb); fl6 = (struct flowi6) { .flowi6_oif = vif->link, .daddr = ipv6h->daddr, }; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); goto out_free; } skb_dst_drop(skb); skb_dst_set(skb, dst); /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ dev = vif->dev; skb->dev = dev; vif->pkt_out++; vif->bytes_out += skb->len; /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev))) goto out_free; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; IP6CB(skb)->flags |= IP6SKB_FORWARDED; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, dev, ip6mr_forward2_finish); out_free: kfree_skb(skb); return 0; } static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; for (ct = mrt->maxvif - 1; ct >= 0; ct--) { if (mrt->vif_table[ct].dev == dev) break; } return ct; } static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; c->_c.mfc_un.res.bytes += skb->len; c->_c.mfc_un.res.lastuse = jiffies; if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ rcu_read_lock(); cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) { rcu_read_unlock(); goto forward; } rcu_read_unlock(); } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (mrt->vif_table[vif].dev != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); } goto dont_forward; } forward: mrt->vif_table[vif].pkt_in++; mrt->vif_table[vif].bytes_in += skb->len; /* * Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* * Multicast packets for forwarding arrive here */ int ip6_mr_input(struct sk_buff *skb) { struct mfc6_cache *cache; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int err; struct net_device *dev; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. */ dev = skb->dev; if (netif_is_l3_master(skb->dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } read_lock(&mrt_lock); cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* * No usable cache entry */ if (!cache) { int vif; vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); read_unlock(&mrt_lock); return err; } read_unlock(&mrt_lock); kfree_skb(skb); return -ENODEV; } ip6_mr_forward(net, mrt, dev, skb, cache); read_unlock(&mrt_lock); return 0; } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return -ENOENT; read_lock(&mrt_lock); cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, vif); } if (!cache) { struct sk_buff *skb2; struct ipv6hdr *iph; struct net_device *dev; int vif; dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { read_unlock(&mrt_lock); return -ENODEV; } /* really correct? */ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb2) { read_unlock(&mrt_lock); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); skb_reset_network_header(skb2); iph = ipv6_hdr(skb2); iph->version = 0; iph->priority = 0; iph->flow_lbl[0] = 0; iph->flow_lbl[1] = 0; iph->flow_lbl[2] = 0; iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); read_unlock(&mrt_lock); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); read_unlock(&mrt_lock); return err; } static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IP6MR; rtm->rtm_dst_len = 128; rtm->rtm_src_len = 128; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, cmd, flags); } static int mr6_msgsize(bool unresolved, int maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IP6MRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IP6MRA_CREPORT_MIF_ID */ /* IP6MRA_CREPORT_SRC_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_DST_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct mrt6msg *msg; struct sk_buff *skb; struct nlattr *nla; int payloadlen; payloadlen = pkt->len - sizeof(struct mrt6msg); msg = (struct mrt6msg *)skb_transport_header(pkt); skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC); if (!skb) goto errout; nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0); if (!nlh) goto errout; rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IP6MR; if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) || nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) || nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, &msg->im6_src) || nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, &msg->im6_dst)) goto nla_put_failure; nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen); if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg), nla_data(nla), payloadlen)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_cancel(skb, nlh); errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS); } static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct fib_dump_filter filter = {}; int err; if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh, &filter, cb); if (err < 0) return err; } if (filter.table_id) { struct mr_table *mrt; mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); return -ENOENT; } err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); return skb->len ? : err; } return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); }
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/mrp_bridge.h> #include "br_private_mrp.h" static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; static const u8 mrp_in_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x3 }; static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); static struct br_frame_type mrp_frame_type __read_mostly = { .type = cpu_to_be16(ETH_P_MRP), .frame_handler = br_mrp_process, }; static bool br_mrp_is_ring_port(struct net_bridge_port *p_port, struct net_bridge_port *s_port, struct net_bridge_port *port) { if (port == p_port || port == s_port) return true; return false; } static bool br_mrp_is_in_port(struct net_bridge_port *i_port, struct net_bridge_port *port) { if (port == i_port) return true; return false; } static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br, u32 ifindex) { struct net_bridge_port *res = NULL; struct net_bridge_port *port; list_for_each_entry(port, &br->port_list, list) { if (port->dev->ifindex == ifindex) { res = port; break; } } return res; } static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) { struct br_mrp *res = NULL; struct br_mrp *mrp; hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, lockdep_rtnl_is_held()) { if (mrp->ring_id == ring_id) { res = mrp; break; } } return res; } static struct br_mrp *br_mrp_find_in_id(struct net_bridge *br, u32 in_id) { struct br_mrp *res = NULL; struct br_mrp *mrp; hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, lockdep_rtnl_is_held()) { if (mrp->in_id == in_id) { res = mrp; break; } } return res; } static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) { struct br_mrp *mrp; hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, lockdep_rtnl_is_held()) { struct net_bridge_port *p; p = rtnl_dereference(mrp->p_port); if (p && p->dev->ifindex == ifindex) return false; p = rtnl_dereference(mrp->s_port); if (p && p->dev->ifindex == ifindex) return false; p = rtnl_dereference(mrp->i_port); if (p && p->dev->ifindex == ifindex) return false; } return true; } static struct br_mrp *br_mrp_find_port(struct net_bridge *br, struct net_bridge_port *p) { struct br_mrp *res = NULL; struct br_mrp *mrp; hlist_for_each_entry_rcu(mrp, &br->mrp_list, list, lockdep_rtnl_is_held()) { if (rcu_access_pointer(mrp->p_port) == p || rcu_access_pointer(mrp->s_port) == p || rcu_access_pointer(mrp->i_port) == p) { res = mrp; break; } } return res; } static int br_mrp_next_seq(struct br_mrp *mrp) { mrp->seq_id++; return mrp->seq_id; } static struct sk_buff *br_mrp_skb_alloc(struct net_bridge_port *p, const u8 *src, const u8 *dst) { struct ethhdr *eth_hdr; struct sk_buff *skb; __be16 *version; skb = dev_alloc_skb(MRP_MAX_FRAME_LENGTH); if (!skb) return NULL; skb->dev = p->dev; skb->protocol = htons(ETH_P_MRP); skb->priority = MRP_FRAME_PRIO; skb_reserve(skb, sizeof(*eth_hdr)); eth_hdr = skb_push(skb, sizeof(*eth_hdr)); ether_addr_copy(eth_hdr->h_dest, dst); ether_addr_copy(eth_hdr->h_source, src); eth_hdr->h_proto = htons(ETH_P_MRP); version = skb_put(skb, sizeof(*version)); *version = cpu_to_be16(MRP_VERSION); return skb; } static void br_mrp_skb_tlv(struct sk_buff *skb, enum br_mrp_tlv_header_type type, u8 length) { struct br_mrp_tlv_hdr *hdr; hdr = skb_put(skb, sizeof(*hdr)); hdr->type = type; hdr->length = length; } static void br_mrp_skb_common(struct sk_buff *skb, struct br_mrp *mrp) { struct br_mrp_common_hdr *hdr; br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_COMMON, sizeof(*hdr)); hdr = skb_put(skb, sizeof(*hdr)); hdr->seq_id = cpu_to_be16(br_mrp_next_seq(mrp)); memset(hdr->domain, 0xff, MRP_DOMAIN_UUID_LENGTH); } static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, struct net_bridge_port *p, enum br_mrp_port_role_type port_role) { struct br_mrp_ring_test_hdr *hdr = NULL; struct sk_buff *skb = NULL; if (!p) return NULL; skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_test_dmac); if (!skb) return NULL; br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); hdr = skb_put(skb, sizeof(*hdr)); hdr->prio = cpu_to_be16(mrp->prio); ether_addr_copy(hdr->sa, p->br->dev->dev_addr); hdr->port_role = cpu_to_be16(port_role); hdr->state = cpu_to_be16(mrp->ring_state); hdr->transitions = cpu_to_be16(mrp->ring_transitions); hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); br_mrp_skb_common(skb, mrp); /* In case the node behaves as MRA then the Test frame needs to have * an Option TLV which includes eventually a sub-option TLV that has * the type AUTO_MGR */ if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { struct br_mrp_sub_option1_hdr *sub_opt = NULL; struct br_mrp_tlv_hdr *sub_tlv = NULL; struct br_mrp_oui_hdr *oui = NULL; u8 length; length = sizeof(*sub_opt) + sizeof(*sub_tlv) + sizeof(oui) + MRP_OPT_PADDING; br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_OPTION, length); oui = skb_put(skb, sizeof(*oui)); memset(oui, 0x0, sizeof(*oui)); sub_opt = skb_put(skb, sizeof(*sub_opt)); memset(sub_opt, 0x0, sizeof(*sub_opt)); sub_tlv = skb_put(skb, sizeof(*sub_tlv)); sub_tlv->type = BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR; /* 32 bit alligment shall be ensured therefore add 2 bytes */ skb_put(skb, MRP_OPT_PADDING); } br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); return skb; } static struct sk_buff *br_mrp_alloc_in_test_skb(struct br_mrp *mrp, struct net_bridge_port *p, enum br_mrp_port_role_type port_role) { struct br_mrp_in_test_hdr *hdr = NULL; struct sk_buff *skb = NULL; if (!p) return NULL; skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_in_test_dmac); if (!skb) return NULL; br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_IN_TEST, sizeof(*hdr)); hdr = skb_put(skb, sizeof(*hdr)); hdr->id = cpu_to_be16(mrp->in_id); ether_addr_copy(hdr->sa, p->br->dev->dev_addr); hdr->port_role = cpu_to_be16(port_role); hdr->state = cpu_to_be16(mrp->in_state); hdr->transitions = cpu_to_be16(mrp->in_transitions); hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); br_mrp_skb_common(skb, mrp); br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); return skb; } /* This function is continuously called in the following cases: * - when node role is MRM, in this case test_monitor is always set to false * because it needs to notify the userspace that the ring is open and needs to * send MRP_Test frames * - when node role is MRA, there are 2 subcases: * - when MRA behaves as MRM, in this case is similar with MRM role * - when MRA behaves as MRC, in this case test_monitor is set to true, * because it needs to detect when it stops seeing MRP_Test frames * from MRM node but it doesn't need to send MRP_Test frames. */ static void br_mrp_test_work_expired(struct work_struct *work) { struct delayed_work *del_work = to_delayed_work(work); struct br_mrp *mrp = container_of(del_work, struct br_mrp, test_work); struct net_bridge_port *p; bool notify_open = false; struct sk_buff *skb; if (time_before_eq(mrp->test_end, jiffies)) return; if (mrp->test_count_miss < mrp->test_max_miss) { mrp->test_count_miss++; } else { /* Notify that the ring is open only if the ring state is * closed, otherwise it would continue to notify at every * interval. * Also notify that the ring is open when the node has the * role MRA and behaves as MRC. The reason is that the * userspace needs to know when the MRM stopped sending * MRP_Test frames so that the current node to try to take * the role of a MRM. */ if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED || mrp->test_monitor) notify_open = true; } rcu_read_lock(); p = rcu_dereference(mrp->p_port); if (p) { if (!mrp->test_monitor) { skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); if (!skb) goto out; skb_reset_network_header(skb); dev_queue_xmit(skb); } if (notify_open && !mrp->ring_role_offloaded) br_mrp_ring_port_open(p->dev, true); } p = rcu_dereference(mrp->s_port); if (p) { if (!mrp->test_monitor) { skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); if (!skb) goto out; skb_reset_network_header(skb); dev_queue_xmit(skb); } if (notify_open && !mrp->ring_role_offloaded) br_mrp_ring_port_open(p->dev, true); } out: rcu_read_unlock(); queue_delayed_work(system_wq, &mrp->test_work, usecs_to_jiffies(mrp->test_interval)); } /* This function is continuously called when the node has the interconnect role * MIM. It would generate interconnect test frames and will send them on all 3 * ports. But will also check if it stop receiving interconnect test frames. */ static void br_mrp_in_test_work_expired(struct work_struct *work) { struct delayed_work *del_work = to_delayed_work(work); struct br_mrp *mrp = container_of(del_work, struct br_mrp, in_test_work); struct net_bridge_port *p; bool notify_open = false; struct sk_buff *skb; if (time_before_eq(mrp->in_test_end, jiffies)) return; if (mrp->in_test_count_miss < mrp->in_test_max_miss) { mrp->in_test_count_miss++; } else { /* Notify that the interconnect ring is open only if the * interconnect ring state is closed, otherwise it would * continue to notify at every interval. */ if (mrp->in_state == BR_MRP_IN_STATE_CLOSED) notify_open = true; } rcu_read_lock(); p = rcu_dereference(mrp->p_port); if (p) { skb = br_mrp_alloc_in_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); if (!skb) goto out; skb_reset_network_header(skb); dev_queue_xmit(skb); if (notify_open && !mrp->in_role_offloaded) br_mrp_in_port_open(p->dev, true); } p = rcu_dereference(mrp->s_port); if (p) { skb = br_mrp_alloc_in_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); if (!skb) goto out; skb_reset_network_header(skb); dev_queue_xmit(skb); if (notify_open && !mrp->in_role_offloaded) br_mrp_in_port_open(p->dev, true); } p = rcu_dereference(mrp->i_port); if (p) { skb = br_mrp_alloc_in_test_skb(mrp, p, BR_MRP_PORT_ROLE_INTER); if (!skb) goto out; skb_reset_network_header(skb); dev_queue_xmit(skb); if (notify_open && !mrp->in_role_offloaded) br_mrp_in_port_open(p->dev, true); } out: rcu_read_unlock(); queue_delayed_work(system_wq, &mrp->in_test_work, usecs_to_jiffies(mrp->in_test_interval)); } /* Deletes the MRP instance. * note: called under rtnl_lock */ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) { struct net_bridge_port *p; u8 state; /* Stop sending MRP_Test frames */ cancel_delayed_work_sync(&mrp->test_work); br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0); /* Stop sending MRP_InTest frames if has an interconnect role */ cancel_delayed_work_sync(&mrp->in_test_work); br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0); /* Disable the roles */ br_mrp_switchdev_set_ring_role(br, mrp, BR_MRP_RING_ROLE_DISABLED); p = rtnl_dereference(mrp->i_port); if (p) br_mrp_switchdev_set_in_role(br, mrp, mrp->in_id, mrp->ring_id, BR_MRP_IN_ROLE_DISABLED); br_mrp_switchdev_del(br, mrp); /* Reset the ports */ p = rtnl_dereference(mrp->p_port); if (p) { spin_lock_bh(&br->lock); state = netif_running(br->dev) ? BR_STATE_FORWARDING : BR_STATE_DISABLED; p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->p_port, NULL); } p = rtnl_dereference(mrp->s_port); if (p) { spin_lock_bh(&br->lock); state = netif_running(br->dev) ? BR_STATE_FORWARDING : BR_STATE_DISABLED; p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->s_port, NULL); } p = rtnl_dereference(mrp->i_port); if (p) { spin_lock_bh(&br->lock); state = netif_running(br->dev) ? BR_STATE_FORWARDING : BR_STATE_DISABLED; p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->i_port, NULL); } hlist_del_rcu(&mrp->list); kfree_rcu(mrp, rcu); if (hlist_empty(&br->mrp_list)) br_del_frame(br, &mrp_frame_type); } /* Adds a new MRP instance. * note: called under rtnl_lock */ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) { struct net_bridge_port *p; struct br_mrp *mrp; int err; /* If the ring exists, it is not possible to create another one with the * same ring_id */ mrp = br_mrp_find_id(br, instance->ring_id); if (mrp) return -EINVAL; if (!br_mrp_get_port(br, instance->p_ifindex) || !br_mrp_get_port(br, instance->s_ifindex)) return -EINVAL; /* It is not possible to have the same port part of multiple rings */ if (!br_mrp_unique_ifindex(br, instance->p_ifindex) || !br_mrp_unique_ifindex(br, instance->s_ifindex)) return -EINVAL; mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); if (!mrp) return -ENOMEM; mrp->ring_id = instance->ring_id; mrp->prio = instance->prio; p = br_mrp_get_port(br, instance->p_ifindex); spin_lock_bh(&br->lock); p->state = BR_STATE_FORWARDING; p->flags |= BR_MRP_AWARE; spin_unlock_bh(&br->lock); rcu_assign_pointer(mrp->p_port, p); p = br_mrp_get_port(br, instance->s_ifindex); spin_lock_bh(&br->lock); p->state = BR_STATE_FORWARDING; p->flags |= BR_MRP_AWARE; spin_unlock_bh(&br->lock); rcu_assign_pointer(mrp->s_port, p); if (hlist_empty(&br->mrp_list)) br_add_frame(br, &mrp_frame_type); INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); INIT_DELAYED_WORK(&mrp->in_test_work, br_mrp_in_test_work_expired); hlist_add_tail_rcu(&mrp->list, &br->mrp_list); err = br_mrp_switchdev_add(br, mrp); if (err) goto delete_mrp; return 0; delete_mrp: br_mrp_del_impl(br, mrp); return err; } /* Deletes the MRP instance from which the port is part of * note: called under rtnl_lock */ void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) { struct br_mrp *mrp = br_mrp_find_port(br, p); /* If the port is not part of a MRP instance just bail out */ if (!mrp) return; br_mrp_del_impl(br, mrp); } /* Deletes existing MRP instance based on ring_id * note: called under rtnl_lock */ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance) { struct br_mrp *mrp = br_mrp_find_id(br, instance->ring_id); if (!mrp) return -EINVAL; br_mrp_del_impl(br, mrp); return 0; } /* Set port state, port state can be forwarding, blocked or disabled * note: already called with rtnl_lock */ int br_mrp_set_port_state(struct net_bridge_port *p, enum br_mrp_port_state_type state) { u32 port_state; if (!p || !(p->flags & BR_MRP_AWARE)) return -EINVAL; spin_lock_bh(&p->br->lock); if (state == BR_MRP_PORT_STATE_FORWARDING) port_state = BR_STATE_FORWARDING; else port_state = BR_STATE_BLOCKING; p->state = port_state; spin_unlock_bh(&p->br->lock); br_mrp_port_switchdev_set_state(p, port_state); return 0; } /* Set port role, port role can be primary or secondary * note: already called with rtnl_lock */ int br_mrp_set_port_role(struct net_bridge_port *p, enum br_mrp_port_role_type role) { struct br_mrp *mrp; if (!p || !(p->flags & BR_MRP_AWARE)) return -EINVAL; mrp = br_mrp_find_port(p->br, p); if (!mrp) return -EINVAL; switch (role) { case BR_MRP_PORT_ROLE_PRIMARY: rcu_assign_pointer(mrp->p_port, p); break; case BR_MRP_PORT_ROLE_SECONDARY: rcu_assign_pointer(mrp->s_port, p); break; default: return -EINVAL; } br_mrp_port_switchdev_set_role(p, role); return 0; } /* Set ring state, ring state can be only Open or Closed * note: already called with rtnl_lock */ int br_mrp_set_ring_state(struct net_bridge *br, struct br_mrp_ring_state *state) { struct br_mrp *mrp = br_mrp_find_id(br, state->ring_id); if (!mrp) return -EINVAL; if (mrp->ring_state != state->ring_state) mrp->ring_transitions++; mrp->ring_state = state->ring_state; br_mrp_switchdev_set_ring_state(br, mrp, state->ring_state); return 0; } /* Set ring role, ring role can be only MRM(Media Redundancy Manager) or * MRC(Media Redundancy Client). * note: already called with rtnl_lock */ int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role) { struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); enum br_mrp_hw_support support; if (!mrp) return -EINVAL; mrp->ring_role = role->ring_role; /* If there is an error just bailed out */ support = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); if (support == BR_MRP_NONE) return -EOPNOTSUPP; /* Now detect if the HW actually applied the role or not. If the HW * applied the role it means that the SW will not to do those operations * anymore. For example if the role ir MRM then the HW will notify the * SW when ring is open, but if the is not pushed to the HW the SW will * need to detect when the ring is open */ mrp->ring_role_offloaded = support == BR_MRP_SW ? 0 : 1; return 0; } /* Start to generate or monitor MRP test frames, the frames are generated by * HW and if it fails, they are generated by the SW. * note: already called with rtnl_lock */ int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test) { struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id); enum br_mrp_hw_support support; if (!mrp) return -EINVAL; /* Try to push it to the HW and if it fails then continue with SW * implementation and if that also fails then return error. */ support = br_mrp_switchdev_send_ring_test(br, mrp, test->interval, test->max_miss, test->period, test->monitor); if (support == BR_MRP_NONE) return -EOPNOTSUPP; if (support == BR_MRP_HW) return 0; mrp->test_interval = test->interval; mrp->test_end = jiffies + usecs_to_jiffies(test->period); mrp->test_max_miss = test->max_miss; mrp->test_monitor = test->monitor; mrp->test_count_miss = 0; queue_delayed_work(system_wq, &mrp->test_work, usecs_to_jiffies(test->interval)); return 0; } /* Set in state, int state can be only Open or Closed * note: already called with rtnl_lock */ int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state) { struct br_mrp *mrp = br_mrp_find_in_id(br, state->in_id); if (!mrp) return -EINVAL; if (mrp->in_state != state->in_state) mrp->in_transitions++; mrp->in_state = state->in_state; br_mrp_switchdev_set_in_state(br, mrp, state->in_state); return 0; } /* Set in role, in role can be only MIM(Media Interconnection Manager) or * MIC(Media Interconnection Client). * note: already called with rtnl_lock */ int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role) { struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); enum br_mrp_hw_support support; struct net_bridge_port *p; if (!mrp) return -EINVAL; if (!br_mrp_get_port(br, role->i_ifindex)) return -EINVAL; if (role->in_role == BR_MRP_IN_ROLE_DISABLED) { u8 state; /* It is not allowed to disable a port that doesn't exist */ p = rtnl_dereference(mrp->i_port); if (!p) return -EINVAL; /* Stop the generating MRP_InTest frames */ cancel_delayed_work_sync(&mrp->in_test_work); br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0); /* Remove the port */ spin_lock_bh(&br->lock); state = netif_running(br->dev) ? BR_STATE_FORWARDING : BR_STATE_DISABLED; p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->i_port, NULL); mrp->in_role = role->in_role; mrp->in_id = 0; return 0; } /* It is not possible to have the same port part of multiple rings */ if (!br_mrp_unique_ifindex(br, role->i_ifindex)) return -EINVAL; /* It is not allowed to set a different interconnect port if the mrp * instance has already one. First it needs to be disabled and after * that set the new port */ if (rcu_access_pointer(mrp->i_port)) return -EINVAL; p = br_mrp_get_port(br, role->i_ifindex); spin_lock_bh(&br->lock); p->state = BR_STATE_FORWARDING; p->flags |= BR_MRP_AWARE; spin_unlock_bh(&br->lock); rcu_assign_pointer(mrp->i_port, p); mrp->in_role = role->in_role; mrp->in_id = role->in_id; /* If there is an error just bailed out */ support = br_mrp_switchdev_set_in_role(br, mrp, role->in_id, role->ring_id, role->in_role); if (support == BR_MRP_NONE) return -EOPNOTSUPP; /* Now detect if the HW actually applied the role or not. If the HW * applied the role it means that the SW will not to do those operations * anymore. For example if the role is MIM then the HW will notify the * SW when interconnect ring is open, but if the is not pushed to the HW * the SW will need to detect when the interconnect ring is open. */ mrp->in_role_offloaded = support == BR_MRP_SW ? 0 : 1; return 0; } /* Start to generate MRP_InTest frames, the frames are generated by * HW and if it fails, they are generated by the SW. * note: already called with rtnl_lock */ int br_mrp_start_in_test(struct net_bridge *br, struct br_mrp_start_in_test *in_test) { struct br_mrp *mrp = br_mrp_find_in_id(br, in_test->in_id); enum br_mrp_hw_support support; if (!mrp) return -EINVAL; if (mrp->in_role != BR_MRP_IN_ROLE_MIM) return -EINVAL; /* Try to push it to the HW and if it fails then continue with SW * implementation and if that also fails then return error. */ support = br_mrp_switchdev_send_in_test(br, mrp, in_test->interval, in_test->max_miss, in_test->period); if (support == BR_MRP_NONE) return -EOPNOTSUPP; if (support == BR_MRP_HW) return 0; mrp->in_test_interval = in_test->interval; mrp->in_test_end = jiffies + usecs_to_jiffies(in_test->period); mrp->in_test_max_miss = in_test->max_miss; mrp->in_test_count_miss = 0; queue_delayed_work(system_wq, &mrp->in_test_work, usecs_to_jiffies(in_test->interval)); return 0; } /* Determine if the frame type is a ring frame */ static bool br_mrp_ring_frame(struct sk_buff *skb) { const struct br_mrp_tlv_hdr *hdr; struct br_mrp_tlv_hdr _hdr; hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); if (!hdr) return false; if (hdr->type == BR_MRP_TLV_HEADER_RING_TEST || hdr->type == BR_MRP_TLV_HEADER_RING_TOPO || hdr->type == BR_MRP_TLV_HEADER_RING_LINK_DOWN || hdr->type == BR_MRP_TLV_HEADER_RING_LINK_UP || hdr->type == BR_MRP_TLV_HEADER_OPTION) return true; return false; } /* Determine if the frame type is an interconnect frame */ static bool br_mrp_in_frame(struct sk_buff *skb) { const struct br_mrp_tlv_hdr *hdr; struct br_mrp_tlv_hdr _hdr; hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); if (!hdr) return false; if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST || hdr->type == BR_MRP_TLV_HEADER_IN_TOPO || hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN || hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP || hdr->type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) return true; return false; } /* Process only MRP Test frame. All the other MRP frames are processed by * userspace application * note: already called with rcu_read_lock */ static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, struct sk_buff *skb) { const struct br_mrp_tlv_hdr *hdr; struct br_mrp_tlv_hdr _hdr; /* Each MRP header starts with a version field which is 16 bits. * Therefore skip the version and get directly the TLV header. */ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); if (!hdr) return; if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) return; mrp->test_count_miss = 0; /* Notify the userspace that the ring is closed only when the ring is * not closed */ if (mrp->ring_state != BR_MRP_RING_STATE_CLOSED) br_mrp_ring_port_open(port->dev, false); } /* Determine if the test hdr has a better priority than the node */ static bool br_mrp_test_better_than_own(struct br_mrp *mrp, struct net_bridge *br, const struct br_mrp_ring_test_hdr *hdr) { u16 prio = be16_to_cpu(hdr->prio); if (prio < mrp->prio || (prio == mrp->prio && ether_addr_to_u64(hdr->sa) < ether_addr_to_u64(br->dev->dev_addr))) return true; return false; } /* Process only MRP Test frame. All the other MRP frames are processed by * userspace application * note: already called with rcu_read_lock */ static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb) { const struct br_mrp_ring_test_hdr *test_hdr; struct br_mrp_ring_test_hdr _test_hdr; const struct br_mrp_tlv_hdr *hdr; struct br_mrp_tlv_hdr _hdr; /* Each MRP header starts with a version field which is 16 bits. * Therefore skip the version and get directly the TLV header. */ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); if (!hdr) return; if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) return; test_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr), sizeof(_test_hdr), &_test_hdr); if (!test_hdr) return; /* Only frames that have a better priority than the node will * clear the miss counter because otherwise the node will need to behave * as MRM. */ if (br_mrp_test_better_than_own(mrp, br, test_hdr)) mrp->test_count_miss = 0; } /* Process only MRP InTest frame. All the other MRP frames are processed by * userspace application * note: already called with rcu_read_lock */ static bool br_mrp_mim_process(struct br_mrp *mrp, struct net_bridge_port *port, struct sk_buff *skb) { const struct br_mrp_in_test_hdr *in_hdr; struct br_mrp_in_test_hdr _in_hdr; const struct br_mrp_tlv_hdr *hdr; struct br_mrp_tlv_hdr _hdr; /* Each MRP header starts with a version field which is 16 bits. * Therefore skip the version and get directly the TLV header. */ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); if (!hdr) return false; /* The check for InTest frame type was already done */ in_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr), sizeof(_in_hdr), &_in_hdr); if (!in_hdr) return false; /* It needs to process only it's own InTest frames. */ if (mrp->in_id != ntohs(in_hdr->id)) return false; mrp->in_test_count_miss = 0; /* Notify the userspace that the ring is closed only when the ring is * not closed */ if (mrp->in_state != BR_MRP_IN_STATE_CLOSED) br_mrp_in_port_open(port->dev, false); return true; } /* Get the MRP frame type * note: already called with rcu_read_lock */ static u8 br_mrp_get_frame_type(struct sk_buff *skb) { const struct br_mrp_tlv_hdr *hdr; struct br_mrp_tlv_hdr _hdr; /* Each MRP header starts with a version field which is 16 bits. * Therefore skip the version and get directly the TLV header. */ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); if (!hdr) return 0xff; return hdr->type; } static bool br_mrp_mrm_behaviour(struct br_mrp *mrp) { if (mrp->ring_role == BR_MRP_RING_ROLE_MRM || (mrp->ring_role == BR_MRP_RING_ROLE_MRA && !mrp->test_monitor)) return true; return false; } static bool br_mrp_mrc_behaviour(struct br_mrp *mrp) { if (mrp->ring_role == BR_MRP_RING_ROLE_MRC || (mrp->ring_role == BR_MRP_RING_ROLE_MRA && mrp->test_monitor)) return true; return false; } /* This will just forward the frame to the other mrp ring ports, depending on * the frame type, ring role and interconnect role * note: already called with rcu_read_lock */ static int br_mrp_rcv(struct net_bridge_port *p, struct sk_buff *skb, struct net_device *dev) { struct net_bridge_port *p_port, *s_port, *i_port = NULL; struct net_bridge_port *p_dst, *s_dst, *i_dst = NULL; struct net_bridge *br; struct br_mrp *mrp; /* If port is disabled don't accept any frames */ if (p->state == BR_STATE_DISABLED) return 0; br = p->br; mrp = br_mrp_find_port(br, p); if (unlikely(!mrp)) return 0; p_port = rcu_dereference(mrp->p_port); if (!p_port) return 0; p_dst = p_port; s_port = rcu_dereference(mrp->s_port); if (!s_port) return 0; s_dst = s_port; /* If the frame is a ring frame then it is not required to check the * interconnect role and ports to process or forward the frame */ if (br_mrp_ring_frame(skb)) { /* If the role is MRM then don't forward the frames */ if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { br_mrp_mrm_process(mrp, p, skb); goto no_forward; } /* If the role is MRA then don't forward the frames if it * behaves as MRM node */ if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { if (!mrp->test_monitor) { br_mrp_mrm_process(mrp, p, skb); goto no_forward; } br_mrp_mra_process(mrp, br, p, skb); } goto forward; } if (br_mrp_in_frame(skb)) { u8 in_type = br_mrp_get_frame_type(skb); i_port = rcu_dereference(mrp->i_port); i_dst = i_port; /* If the ring port is in block state it should not forward * In_Test frames */ if (br_mrp_is_ring_port(p_port, s_port, p) && p->state == BR_STATE_BLOCKING && in_type == BR_MRP_TLV_HEADER_IN_TEST) goto no_forward; /* Nodes that behaves as MRM needs to stop forwarding the * frames in case the ring is closed, otherwise will be a loop. * In this case the frame is no forward between the ring ports. */ if (br_mrp_mrm_behaviour(mrp) && br_mrp_is_ring_port(p_port, s_port, p) && (s_port->state != BR_STATE_FORWARDING || p_port->state != BR_STATE_FORWARDING)) { p_dst = NULL; s_dst = NULL; } /* A node that behaves as MRC and doesn't have a interconnect * role then it should forward all frames between the ring ports * because it doesn't have an interconnect port */ if (br_mrp_mrc_behaviour(mrp) && mrp->in_role == BR_MRP_IN_ROLE_DISABLED) goto forward; if (mrp->in_role == BR_MRP_IN_ROLE_MIM) { if (in_type == BR_MRP_TLV_HEADER_IN_TEST) { /* MIM should not forward it's own InTest * frames */ if (br_mrp_mim_process(mrp, p, skb)) { goto no_forward; } else { if (br_mrp_is_ring_port(p_port, s_port, p)) i_dst = NULL; if (br_mrp_is_in_port(i_port, p)) goto no_forward; } } else { /* MIM should forward IntLinkChange/Status and * IntTopoChange between ring ports but MIM * should not forward IntLinkChange/Status and * IntTopoChange if the frame was received at * the interconnect port */ if (br_mrp_is_ring_port(p_port, s_port, p)) i_dst = NULL; if (br_mrp_is_in_port(i_port, p)) goto no_forward; } } if (mrp->in_role == BR_MRP_IN_ROLE_MIC) { /* MIC should forward InTest frames on all ports * regardless of the received port */ if (in_type == BR_MRP_TLV_HEADER_IN_TEST) goto forward; /* MIC should forward IntLinkChange frames only if they * are received on ring ports to all the ports */ if (br_mrp_is_ring_port(p_port, s_port, p) && (in_type == BR_MRP_TLV_HEADER_IN_LINK_UP || in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN)) goto forward; /* MIC should forward IntLinkStatus frames only to * interconnect port if it was received on a ring port. * If it is received on interconnect port then, it * should be forward on both ring ports */ if (br_mrp_is_ring_port(p_port, s_port, p) && in_type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) { p_dst = NULL; s_dst = NULL; } /* Should forward the InTopo frames only between the * ring ports */ if (in_type == BR_MRP_TLV_HEADER_IN_TOPO) { i_dst = NULL; goto forward; } /* In all the other cases don't forward the frames */ goto no_forward; } } forward: if (p_dst) br_forward(p_dst, skb, true, false); if (s_dst) br_forward(s_dst, skb, true, false); if (i_dst) br_forward(i_dst, skb, true, false); no_forward: return 1; } /* Check if the frame was received on a port that is part of MRP ring * and if the frame has MRP eth. In that case process the frame otherwise do * normal forwarding. * note: already called with rcu_read_lock */ static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) { /* If there is no MRP instance do normal forwarding */ if (likely(!(p->flags & BR_MRP_AWARE))) goto out; return br_mrp_rcv(p, skb, p->dev); out: return 0; } bool br_mrp_enabled(struct net_bridge *br) { return !hlist_empty(&br->mrp_list); }
708 708 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include <crypto/internal/blake2s.h> #include <linux/types.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sizes.h> #include <asm/cpufeature.h> #include <asm/fpu/api.h> #include <asm/processor.h> #include <asm/simd.h> asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, const u8 *block, const size_t nblocks, const u32 inc); asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, const u8 *block, const size_t nblocks, const u32 inc); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); void blake2s_compress(struct blake2s_state *state, const u8 *block, size_t nblocks, const u32 inc) { /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { blake2s_compress_generic(state, block, nblocks, inc); return; } do { const size_t blocks = min_t(size_t, nblocks, SZ_4K / BLAKE2S_BLOCK_SIZE); kernel_fpu_begin(); if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&blake2s_use_avx512)) blake2s_compress_avx512(state, block, blocks, inc); else blake2s_compress_ssse3(state, block, blocks, inc); kernel_fpu_end(); nblocks -= blocks; block += blocks * BLAKE2S_BLOCK_SIZE; } while (nblocks); } EXPORT_SYMBOL(blake2s_compress); static int __init blake2s_mod_init(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) static_branch_enable(&blake2s_use_ssse3); if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL)) static_branch_enable(&blake2s_use_avx512); return 0; } module_init(blake2s_mod_init); MODULE_LICENSE("GPL v2");
15 15 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2015 Intel Deutschland GmbH */ #include <net/mac80211.h> #include "ieee80211_i.h" #include "trace.h" #include "driver-ops.h" int drv_start(struct ieee80211_local *local) { int ret; might_sleep(); if (WARN_ON(local->started)) return -EALREADY; trace_drv_start(local); local->started = true; /* allow rx frames */ smp_mb(); ret = local->ops->start(&local->hw); trace_drv_return_int(local, ret); if (ret) local->started = false; return ret; } void drv_stop(struct ieee80211_local *local) { might_sleep(); if (WARN_ON(!local->started)) return; trace_drv_stop(local); local->ops->stop(&local->hw); trace_drv_return_void(local); /* sync away all work on the tasklet before clearing started */ tasklet_disable(&local->tasklet); tasklet_enable(&local->tasklet); barrier(); local->started = false; } int drv_add_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; trace_drv_add_interface(local, sdata); ret = local->ops->add_interface(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); if (ret == 0) sdata->flags |= IEEE80211_SDATA_IN_DRIVER; return ret; } int drv_change_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_change_interface(local, sdata, type, p2p); ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); trace_drv_return_int(local, ret); return ret; } void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_remove_interface(local, sdata); local->ops->remove_interface(&local->hw, &sdata->vif); sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; trace_drv_return_void(local); } __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state) { int ret = 0; might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state); if (local->ops->sta_state) { ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta, old_state, new_state); } else if (old_state == IEEE80211_STA_AUTH && new_state == IEEE80211_STA_ASSOC) { ret = drv_sta_add(local, sdata, &sta->sta); if (ret == 0) { sta->uploaded = true; if (rcu_access_pointer(sta->sta.rates)) drv_sta_rate_tbl_update(local, sdata, &sta->sta); } } else if (old_state == IEEE80211_STA_ASSOC && new_state == IEEE80211_STA_AUTH) { drv_sta_remove(local, sdata, &sta->sta); } trace_drv_return_int(local, ret); return ret; } __must_check int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { int ret = -EOPNOTSUPP; might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_set_txpwr(local, sdata, &sta->sta); if (local->ops->sta_set_txpwr) ret = local->ops->sta_set_txpwr(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_int(local, ret); return ret; } void drv_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u32 changed) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); trace_drv_sta_rc_update(local, sdata, sta, changed); if (local->ops->sta_rc_update) local->ops->sta_rc_update(&local->hw, &sdata->vif, sta, changed); trace_drv_return_void(local); } int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 ac, const struct ieee80211_tx_queue_params *params) { int ret = -EOPNOTSUPP; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; if (params->cw_min == 0 || params->cw_min > params->cw_max) { /* * If we can't configure hardware anyway, don't warn. We may * never have initialized the CW parameters. */ WARN_ONCE(local->ops->conf_tx, "%s: invalid CW_min/CW_max: %d/%d\n", sdata->name, params->cw_min, params->cw_max); return -EINVAL; } trace_drv_conf_tx(local, sdata, ac, params); if (local->ops->conf_tx) ret = local->ops->conf_tx(&local->hw, &sdata->vif, ac, params); trace_drv_return_int(local, ret); return ret; } u64 drv_get_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { u64 ret = -1ULL; might_sleep(); if (!check_sdata_in_driver(sdata)) return ret; trace_drv_get_tsf(local, sdata); if (local->ops->get_tsf) ret = local->ops->get_tsf(&local->hw, &sdata->vif); trace_drv_return_u64(local, ret); return ret; } void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_set_tsf(local, sdata, tsf); if (local->ops->set_tsf) local->ops->set_tsf(&local->hw, &sdata->vif, tsf); trace_drv_return_void(local); } void drv_offset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, s64 offset) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_offset_tsf(local, sdata, offset); if (local->ops->offset_tsf) local->ops->offset_tsf(&local->hw, &sdata->vif, offset); trace_drv_return_void(local); } void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_reset_tsf(local, sdata); if (local->ops->reset_tsf) local->ops->reset_tsf(&local->hw, &sdata->vif); trace_drv_return_void(local); } int drv_switch_vif_chanctx(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode) { int ret = 0; int i; might_sleep(); if (!local->ops->switch_vif_chanctx) return -EOPNOTSUPP; for (i = 0; i < n_vifs; i++) { struct ieee80211_chanctx *new_ctx = container_of(vifs[i].new_ctx, struct ieee80211_chanctx, conf); struct ieee80211_chanctx *old_ctx = container_of(vifs[i].old_ctx, struct ieee80211_chanctx, conf); WARN_ON_ONCE(!old_ctx->driver_present); WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS && new_ctx->driver_present) || (mode == CHANCTX_SWMODE_REASSIGN_VIF && !new_ctx->driver_present)); } trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode); ret = local->ops->switch_vif_chanctx(&local->hw, vifs, n_vifs, mode); trace_drv_return_int(local, ret); if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) { for (i = 0; i < n_vifs; i++) { struct ieee80211_chanctx *new_ctx = container_of(vifs[i].new_ctx, struct ieee80211_chanctx, conf); struct ieee80211_chanctx *old_ctx = container_of(vifs[i].old_ctx, struct ieee80211_chanctx, conf); new_ctx->driver_present = true; old_ctx->driver_present = false; } } return ret; } int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_ampdu_params *params) { int ret = -EOPNOTSUPP; might_sleep(); if (!sdata) return -EIO; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_ampdu_action(local, sdata, params); if (local->ops->ampdu_action) ret = local->ops->ampdu_action(&local->hw, &sdata->vif, params); trace_drv_return_int(local, ret); return ret; }
31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Hash: Hash algorithms under the crypto API * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_HASH_H #define _CRYPTO_HASH_H #include <linux/crypto.h> #include <linux/string.h> struct crypto_ahash; /** * DOC: Message Digest Algorithm Definitions * * These data structures define modular message digest algorithm * implementations, managed via crypto_register_ahash(), * crypto_register_shash(), crypto_unregister_ahash() and * crypto_unregister_shash(). */ /** * struct hash_alg_common - define properties of message digest * @digestsize: Size of the result of the transformation. A buffer of this size * must be available to the @final and @finup calls, so they can * store the resulting hash into it. For various predefined sizes, * search include/crypto/ using * git grep _DIGEST_SIZE include/crypto. * @statesize: Size of the block for partial state of the transformation. A * buffer of this size must be passed to the @export function as it * will save the partial state of the transformation into it. On the * other side, the @import function will load the state from a * buffer of this size as well. * @base: Start of data structure of cipher algorithm. The common data * structure of crypto_alg contains information common to all ciphers. * The hash_alg_common data structure now adds the hash-specific * information. */ struct hash_alg_common { unsigned int digestsize; unsigned int statesize; struct crypto_alg base; }; struct ahash_request { struct crypto_async_request base; unsigned int nbytes; struct scatterlist *src; u8 *result; /* This field may only be used by the ahash API code. */ void *priv; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; /** * struct ahash_alg - asynchronous message digest definition * @init: **[mandatory]** Initialize the transformation context. Intended only to initialize the * state of the HASH transformation at the beginning. This shall fill in * the internal structures used during the entire duration of the whole * transformation. No data processing happens at this point. Driver code * implementation must not use req->result. * @update: **[mandatory]** Push a chunk of data into the driver for transformation. This * function actually pushes blocks of data from upper layers into the * driver, which then passes those to the hardware as seen fit. This * function must not finalize the HASH transformation by calculating the * final message digest as this only adds more data into the * transformation. This function shall not modify the transformation * context, as this function may be called in parallel with the same * transformation object. Data processing can happen synchronously * [SHASH] or asynchronously [AHASH] at this point. Driver must not use * req->result. * @final: **[mandatory]** Retrieve result from the driver. This function finalizes the * transformation and retrieves the resulting hash from the driver and * pushes it back to upper layers. No data processing happens at this * point unless hardware requires it to finish the transformation * (then the data buffered by the device driver is processed). * @finup: **[optional]** Combination of @update and @final. This function is effectively a * combination of @update and @final calls issued in sequence. As some * hardware cannot do @update and @final separately, this callback was * added to allow such hardware to be used at least by IPsec. Data * processing can happen synchronously [SHASH] or asynchronously [AHASH] * at this point. * @digest: Combination of @init and @update and @final. This function * effectively behaves as the entire chain of operations, @init, * @update and @final issued in sequence. Just like @finup, this was * added for hardware which cannot do even the @finup, but can only do * the whole transformation in one run. Data processing can happen * synchronously [SHASH] or asynchronously [AHASH] at this point. * @setkey: Set optional key used by the hashing algorithm. Intended to push * optional key used by the hashing algorithm from upper layers into * the driver. This function can store the key in the transformation * context or can outright program it into the hardware. In the former * case, one must be careful to program the key into the hardware at * appropriate time and one must be careful that .setkey() can be * called multiple times during the existence of the transformation * object. Not all hashing algorithms do implement this function as it * is only needed for keyed message digests. SHAx/MDx/CRCx do NOT * implement this function. HMAC(MDx)/HMAC(SHAx)/CMAC(AES) do implement * this function. This function must be called before any other of the * @init, @update, @final, @finup, @digest is called. No data * processing happens at this point. * @export: Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. Driver must not use req->result. * @import: Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. Driver must not use * req->result. * @init_tfm: Initialize the cryptographic transformation object. * This function is called only once at the instantiation * time, right after the transformation context was * allocated. In case the cryptographic hardware has * some special requirements which need to be handled * by software, this function shall check for the precise * requirement of the transformation and put any software * fallbacks in place. * @exit_tfm: Deinitialize the cryptographic transformation object. * This is a counterpart to @init_tfm, used to remove * various changes set in @init_tfm. * @halg: see struct hash_alg_common */ struct ahash_alg { int (*init)(struct ahash_request *req); int (*update)(struct ahash_request *req); int (*final)(struct ahash_request *req); int (*finup)(struct ahash_request *req); int (*digest)(struct ahash_request *req); int (*export)(struct ahash_request *req, void *out); int (*import)(struct ahash_request *req, const void *in); int (*setkey)(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); int (*init_tfm)(struct crypto_ahash *tfm); void (*exit_tfm)(struct crypto_ahash *tfm); struct hash_alg_common halg; }; struct shash_desc { struct crypto_shash *tfm; void *__ctx[] __aligned(ARCH_SLAB_MINALIGN); }; #define HASH_MAX_DIGESTSIZE 64 /* * Worst case is hmac(sha3-224-generic). Its context is a nested 'shash_desc' * containing a 'struct sha3_state'. */ #define HASH_MAX_DESCSIZE (sizeof(struct shash_desc) + 360) #define HASH_MAX_STATESIZE 512 #define SHASH_DESC_ON_STACK(shash, ctx) \ char __##shash##_desc[sizeof(struct shash_desc) + HASH_MAX_DESCSIZE] \ __aligned(__alignof__(struct shash_desc)); \ struct shash_desc *shash = (struct shash_desc *)__##shash##_desc /** * struct shash_alg - synchronous message digest definition * @init: see struct ahash_alg * @update: see struct ahash_alg * @final: see struct ahash_alg * @finup: see struct ahash_alg * @digest: see struct ahash_alg * @export: see struct ahash_alg * @import: see struct ahash_alg * @setkey: see struct ahash_alg * @init_tfm: Initialize the cryptographic transformation object. * This function is called only once at the instantiation * time, right after the transformation context was * allocated. In case the cryptographic hardware has * some special requirements which need to be handled * by software, this function shall check for the precise * requirement of the transformation and put any software * fallbacks in place. * @exit_tfm: Deinitialize the cryptographic transformation object. * This is a counterpart to @init_tfm, used to remove * various changes set in @init_tfm. * @digestsize: see struct ahash_alg * @statesize: see struct ahash_alg * @descsize: Size of the operational state for the message digest. This state * size is the memory size that needs to be allocated for * shash_desc.__ctx * @base: internally used */ struct shash_alg { int (*init)(struct shash_desc *desc); int (*update)(struct shash_desc *desc, const u8 *data, unsigned int len); int (*final)(struct shash_desc *desc, u8 *out); int (*finup)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); int (*digest)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); int (*export)(struct shash_desc *desc, void *out); int (*import)(struct shash_desc *desc, const void *in); int (*setkey)(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); int (*init_tfm)(struct crypto_shash *tfm); void (*exit_tfm)(struct crypto_shash *tfm); unsigned int descsize; /* These fields must match hash_alg_common. */ unsigned int digestsize __attribute__ ((aligned(__alignof__(struct hash_alg_common)))); unsigned int statesize; struct crypto_alg base; }; struct crypto_ahash { int (*init)(struct ahash_request *req); int (*update)(struct ahash_request *req); int (*final)(struct ahash_request *req); int (*finup)(struct ahash_request *req); int (*digest)(struct ahash_request *req); int (*export)(struct ahash_request *req, void *out); int (*import)(struct ahash_request *req, const void *in); int (*setkey)(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); unsigned int reqsize; struct crypto_tfm base; }; struct crypto_shash { unsigned int descsize; struct crypto_tfm base; }; /** * DOC: Asynchronous Message Digest API * * The asynchronous message digest API is used with the ciphers of type * CRYPTO_ALG_TYPE_AHASH (listed as type "ahash" in /proc/crypto) * * The asynchronous cipher operation discussion provided for the * CRYPTO_ALG_TYPE_SKCIPHER API applies here as well. */ static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_ahash, base); } /** * crypto_alloc_ahash() - allocate ahash cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * ahash cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an ahash. The returned struct * crypto_ahash is the cipher handle that is required for any subsequent * API invocation for that ahash. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) { return &tfm->base; } /** * crypto_free_ahash() - zeroize and free the ahash handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_ahash(struct crypto_ahash *tfm) { crypto_destroy_tfm(tfm, crypto_ahash_tfm(tfm)); } /** * crypto_has_ahash() - Search for the availability of an ahash. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * ahash * @type: specifies the type of the ahash * @mask: specifies the mask for the ahash * * Return: true when the ahash is known to the kernel crypto API; false * otherwise */ int crypto_has_ahash(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm) { return crypto_tfm_alg_name(crypto_ahash_tfm(tfm)); } static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm) { return crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); } static inline unsigned int crypto_ahash_alignmask( struct crypto_ahash *tfm) { return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm)); } /** * crypto_ahash_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the message digest cipher referenced with the cipher * handle is returned. * * Return: block size of cipher */ static inline unsigned int crypto_ahash_blocksize(struct crypto_ahash *tfm) { return crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm)); } static inline struct hash_alg_common *__crypto_hash_alg_common( struct crypto_alg *alg) { return container_of(alg, struct hash_alg_common, base); } static inline struct hash_alg_common *crypto_hash_alg_common( struct crypto_ahash *tfm) { return __crypto_hash_alg_common(crypto_ahash_tfm(tfm)->__crt_alg); } /** * crypto_ahash_digestsize() - obtain message digest size * @tfm: cipher handle * * The size for the message digest created by the message digest cipher * referenced with the cipher handle is returned. * * * Return: message digest size of cipher */ static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) { return crypto_hash_alg_common(tfm)->digestsize; } /** * crypto_ahash_statesize() - obtain size of the ahash state * @tfm: cipher handle * * Return the size of the ahash state. With the crypto_ahash_export() * function, the caller can export the state into a buffer whose size is * defined with this function. * * Return: size of the ahash state */ static inline unsigned int crypto_ahash_statesize(struct crypto_ahash *tfm) { return crypto_hash_alg_common(tfm)->statesize; } static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) { return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); } static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) { crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); } static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); } /** * crypto_ahash_reqtfm() - obtain cipher handle from request * @req: asynchronous request handle that contains the reference to the ahash * cipher handle * * Return the ahash cipher handle that is registered with the asynchronous * request handle ahash_request. * * Return: ahash cipher handle */ static inline struct crypto_ahash *crypto_ahash_reqtfm( struct ahash_request *req) { return __crypto_ahash_cast(req->base.tfm); } /** * crypto_ahash_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: size of the request data */ static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) { return tfm->reqsize; } static inline void *ahash_request_ctx(struct ahash_request *req) { return req->__ctx; } /** * crypto_ahash_setkey - set key for cipher handle * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the ahash cipher. The cipher * handle must point to a keyed hash in order for this function to succeed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); /** * crypto_ahash_finup() - update and finalize message digest * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * This function is a "short-hand" for the function calls of * crypto_ahash_update and crypto_ahash_final. The parameters have the same * meaning as discussed for those separate functions. * * Return: see crypto_ahash_final() */ int crypto_ahash_finup(struct ahash_request *req); /** * crypto_ahash_final() - calculate message digest * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * Finalize the message digest operation and create the message digest * based on all data added to the cipher handle. The message digest is placed * into the output buffer registered with the ahash_request handle. * * Return: * 0 if the message digest was successfully calculated; * -EINPROGRESS if data is fed into hardware (DMA) or queued for later; * -EBUSY if queue is full and request should be resubmitted later; * other < 0 if an error occurred */ int crypto_ahash_final(struct ahash_request *req); /** * crypto_ahash_digest() - calculate message digest for a buffer * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * This function is a "short-hand" for the function calls of crypto_ahash_init, * crypto_ahash_update and crypto_ahash_final. The parameters have the same * meaning as discussed for those separate three functions. * * Return: see crypto_ahash_final() */ int crypto_ahash_digest(struct ahash_request *req); /** * crypto_ahash_export() - extract current message digest state * @req: reference to the ahash_request handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * This function exports the hash state of the ahash_request handle into the * caller-allocated output buffer out which must have sufficient size (e.g. by * calling crypto_ahash_statesize()). * * Return: 0 if the export was successful; < 0 if an error occurred */ static inline int crypto_ahash_export(struct ahash_request *req, void *out) { return crypto_ahash_reqtfm(req)->export(req, out); } /** * crypto_ahash_import() - import message digest state * @req: reference to ahash_request handle the state is imported into * @in: buffer holding the state * * This function imports the hash state into the ahash_request handle from the * input buffer. That buffer should have been generated with the * crypto_ahash_export function. * * Return: 0 if the import was successful; < 0 if an error occurred */ static inline int crypto_ahash_import(struct ahash_request *req, const void *in) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return tfm->import(req, in); } /** * crypto_ahash_init() - (re)initialize message digest handle * @req: ahash_request handle that already is initialized with all necessary * data using the ahash_request_* API functions * * The call (re-)initializes the message digest referenced by the ahash_request * handle. Any potentially existing state created by previous operations is * discarded. * * Return: see crypto_ahash_final() */ static inline int crypto_ahash_init(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return tfm->init(req); } /** * crypto_ahash_update() - add data to message digest for processing * @req: ahash_request handle that was previously initialized with the * crypto_ahash_init call. * * Updates the message digest state of the &ahash_request handle. The input data * is pointed to by the scatter/gather list registered in the &ahash_request * handle * * Return: see crypto_ahash_final() */ static inline int crypto_ahash_update(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct crypto_alg *alg = tfm->base.__crt_alg; unsigned int nbytes = req->nbytes; int ret; crypto_stats_get(alg); ret = crypto_ahash_reqtfm(req)->update(req); crypto_stats_ahash_update(nbytes, ret, alg); return ret; } /** * DOC: Asynchronous Hash Request Handle * * The &ahash_request data structure contains all pointers to data * required for the asynchronous cipher operation. This includes the cipher * handle (which can be used by multiple &ahash_request instances), pointer * to plaintext and the message digest output buffer, asynchronous callback * function, etc. It acts as a handle to the ahash_request_* API calls in a * similar way as ahash handle to the crypto_ahash_* API calls. */ /** * ahash_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing ahash handle in the request * data structure with a different one. */ static inline void ahash_request_set_tfm(struct ahash_request *req, struct crypto_ahash *tfm) { req->base.tfm = crypto_ahash_tfm(tfm); } /** * ahash_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the ahash * message digest API calls. During * the allocation, the provided ahash handle * is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct ahash_request *ahash_request_alloc( struct crypto_ahash *tfm, gfp_t gfp) { struct ahash_request *req; req = kmalloc(sizeof(struct ahash_request) + crypto_ahash_reqsize(tfm), gfp); if (likely(req)) ahash_request_set_tfm(req, tfm); return req; } /** * ahash_request_free() - zeroize and free the request data structure * @req: request data structure cipher handle to be freed */ static inline void ahash_request_free(struct ahash_request *req) { kfree_sensitive(req); } static inline void ahash_request_zero(struct ahash_request *req) { memzero_explicit(req, sizeof(*req) + crypto_ahash_reqsize(crypto_ahash_reqtfm(req))); } static inline struct ahash_request *ahash_request_cast( struct crypto_async_request *req) { return container_of(req, struct ahash_request, base); } /** * ahash_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * &crypto_async_request data structure provided to the callback function. * * This function allows setting the callback function that is triggered once * the cipher operation completes. * * The callback function is registered with the &ahash_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void ahash_request_set_callback(struct ahash_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * ahash_request_set_crypt() - set data buffers * @req: ahash_request handle to be updated * @src: source scatter/gather list * @result: buffer that is filled with the message digest -- the caller must * ensure that the buffer has sufficient space by, for example, calling * crypto_ahash_digestsize() * @nbytes: number of bytes to process from the source scatter/gather list * * By using this call, the caller references the source scatter/gather list. * The source scatter/gather list points to the data the message digest is to * be calculated for. */ static inline void ahash_request_set_crypt(struct ahash_request *req, struct scatterlist *src, u8 *result, unsigned int nbytes) { req->src = src; req->nbytes = nbytes; req->result = result; } /** * DOC: Synchronous Message Digest API * * The synchronous message digest API is used with the ciphers of type * CRYPTO_ALG_TYPE_SHASH (listed as type "shash" in /proc/crypto) * * The message digest API is able to maintain state information for the * caller. * * The synchronous message digest API can store user-related context in its * shash_desc request data structure. */ /** * crypto_alloc_shash() - allocate message digest handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * message digest cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for a message digest. The returned &struct * crypto_shash is the cipher handle that is required for any subsequent * API invocation for that message digest. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm) { return &tfm->base; } /** * crypto_free_shash() - zeroize and free the message digest handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_shash(struct crypto_shash *tfm) { crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm)); } static inline const char *crypto_shash_alg_name(struct crypto_shash *tfm) { return crypto_tfm_alg_name(crypto_shash_tfm(tfm)); } static inline const char *crypto_shash_driver_name(struct crypto_shash *tfm) { return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); } static inline unsigned int crypto_shash_alignmask( struct crypto_shash *tfm) { return crypto_tfm_alg_alignmask(crypto_shash_tfm(tfm)); } /** * crypto_shash_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the message digest cipher referenced with the cipher * handle is returned. * * Return: block size of cipher */ static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm) { return crypto_tfm_alg_blocksize(crypto_shash_tfm(tfm)); } static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg) { return container_of(alg, struct shash_alg, base); } static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm) { return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg); } /** * crypto_shash_digestsize() - obtain message digest size * @tfm: cipher handle * * The size for the message digest created by the message digest cipher * referenced with the cipher handle is returned. * * Return: digest size of cipher */ static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->digestsize; } static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->statesize; } static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm) { return crypto_tfm_get_flags(crypto_shash_tfm(tfm)); } static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags) { crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags); } static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags); } /** * crypto_shash_descsize() - obtain the operational state size * @tfm: cipher handle * * The size of the operational state the cipher needs during operation is * returned for the hash referenced with the cipher handle. This size is * required to calculate the memory requirements to allow the caller allocating * sufficient memory for operational state. * * The operational state is defined with struct shash_desc where the size of * that data structure is to be calculated as * sizeof(struct shash_desc) + crypto_shash_descsize(alg) * * Return: size of the operational state */ static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm) { return tfm->descsize; } static inline void *shash_desc_ctx(struct shash_desc *desc) { return desc->__ctx; } /** * crypto_shash_setkey() - set key for message digest * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the keyed message digest cipher. The * cipher handle must point to a keyed message digest cipher in order for this * function to succeed. * * Context: Any context. * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); /** * crypto_shash_digest() - calculate message digest for buffer * @desc: see crypto_shash_final() * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This function is a "short-hand" for the function calls of crypto_shash_init, * crypto_shash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate three functions. * * Context: Any context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); /** * crypto_shash_tfm_digest() - calculate message digest for buffer * @tfm: hash transformation object * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This is a simplified version of crypto_shash_digest() for users who don't * want to allocate their own hash descriptor (shash_desc). Instead, * crypto_shash_tfm_digest() takes a hash transformation object (crypto_shash) * directly, and it allocates a hash descriptor on the stack internally. * Note that this stack allocation may be fairly large. * * Context: Any context. * Return: 0 on success; < 0 if an error occurred. */ int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data, unsigned int len, u8 *out); /** * crypto_shash_export() - extract operational state for message digest * @desc: reference to the operational state handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * This function exports the hash state of the operational state handle into the * caller-allocated output buffer out which must have sufficient size (e.g. by * calling crypto_shash_descsize). * * Context: Any context. * Return: 0 if the export creation was successful; < 0 if an error occurred */ static inline int crypto_shash_export(struct shash_desc *desc, void *out) { return crypto_shash_alg(desc->tfm)->export(desc, out); } /** * crypto_shash_import() - import operational state * @desc: reference to the operational state handle the state imported into * @in: buffer holding the state * * This function imports the hash state into the operational state handle from * the input buffer. That buffer should have been generated with the * crypto_ahash_export function. * * Context: Any context. * Return: 0 if the import was successful; < 0 if an error occurred */ static inline int crypto_shash_import(struct shash_desc *desc, const void *in) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_shash_alg(tfm)->import(desc, in); } /** * crypto_shash_init() - (re)initialize message digest * @desc: operational state handle that is already filled * * The call (re-)initializes the message digest referenced by the * operational state handle. Any potentially existing state created by * previous operations is discarded. * * Context: Any context. * Return: 0 if the message digest initialization was successful; < 0 if an * error occurred */ static inline int crypto_shash_init(struct shash_desc *desc) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_shash_alg(tfm)->init(desc); } /** * crypto_shash_update() - add data to message digest for processing * @desc: operational state handle that is already initialized * @data: input data to be added to the message digest * @len: length of the input data * * Updates the message digest state of the operational state handle. * * Context: Any context. * Return: 0 if the message digest update was successful; < 0 if an error * occurred */ int crypto_shash_update(struct shash_desc *desc, const u8 *data, unsigned int len); /** * crypto_shash_final() - calculate message digest * @desc: operational state handle that is already filled with data * @out: output buffer filled with the message digest * * Finalize the message digest operation and create the message digest * based on all data added to the cipher handle. The message digest is placed * into the output buffer. The caller must ensure that the output buffer is * large enough by using crypto_shash_digestsize. * * Context: Any context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_final(struct shash_desc *desc, u8 *out); /** * crypto_shash_finup() - calculate message digest of buffer * @desc: see crypto_shash_final() * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This function is a "short-hand" for the function calls of * crypto_shash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate functions. * * Context: Any context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); static inline void shash_desc_zero(struct shash_desc *desc) { memzero_explicit(desc, sizeof(*desc) + crypto_shash_descsize(desc->tfm)); } #endif /* _CRYPTO_HASH_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 // SPDX-License-Identifier: GPL-2.0-or-later /* * Sysfs attributes of bridge ports * Linux ethernet bridge * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_bridge.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/sched/signal.h> #include "br_private.h" /* IMPORTANT: new bridge port options must be added with netlink support only * please do not add new sysfs entries */ struct brport_attribute { struct attribute attr; ssize_t (*show)(struct net_bridge_port *, char *); int (*store)(struct net_bridge_port *, unsigned long); int (*store_raw)(struct net_bridge_port *, char *); }; #define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \ const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ .show = _show, \ .store_raw = _store, \ }; #define BRPORT_ATTR(_name, _mode, _show, _store) \ const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ .show = _show, \ .store = _store, \ }; #define BRPORT_ATTR_FLAG(_name, _mask) \ static ssize_t show_##_name(struct net_bridge_port *p, char *buf) \ { \ return sprintf(buf, "%d\n", !!(p->flags & _mask)); \ } \ static int store_##_name(struct net_bridge_port *p, unsigned long v) \ { \ return store_flag(p, v, _mask); \ } \ static BRPORT_ATTR(_name, 0644, \ show_##_name, store_##_name) static int store_flag(struct net_bridge_port *p, unsigned long v, unsigned long mask) { struct netlink_ext_ack extack = {0}; unsigned long flags = p->flags; int err; if (v) flags |= mask; else flags &= ~mask; if (flags != p->flags) { err = br_switchdev_set_port_flag(p, flags, mask, &extack); if (err) { netdev_err(p->dev, "%s\n", extack._msg); return err; } p->flags = flags; br_port_flags_change(p, mask); } return 0; } static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->path_cost); } static BRPORT_ATTR(path_cost, 0644, show_path_cost, br_stp_set_path_cost); static ssize_t show_priority(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->priority); } static BRPORT_ATTR(priority, 0644, show_priority, br_stp_set_port_priority); static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_root); } static BRPORT_ATTR(designated_root, 0444, show_designated_root, NULL); static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_bridge); } static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL); static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_port); } static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL); static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_cost); } static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL); static ssize_t show_port_id(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_id); } static BRPORT_ATTR(port_id, 0444, show_port_id, NULL); static ssize_t show_port_no(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_no); } static BRPORT_ATTR(port_no, 0444, show_port_no, NULL); static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->topology_change_ack); } static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL); static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->config_pending); } static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL); static ssize_t show_port_state(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->state); } static BRPORT_ATTR(state, 0444, show_port_state, NULL); static ssize_t show_message_age_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); } static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL); static ssize_t show_forward_delay_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); } static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL); static ssize_t show_hold_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); } static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL); static int store_flush(struct net_bridge_port *p, unsigned long v) { br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry return 0; } static BRPORT_ATTR(flush, 0200, NULL, store_flush); static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%#x\n", p->group_fwd_mask); } static int store_group_fwd_mask(struct net_bridge_port *p, unsigned long v) { if (v & BR_GROUPFWD_MACPAUSE) return -EINVAL; p->group_fwd_mask = v; return 0; } static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask, store_group_fwd_mask); static ssize_t show_backup_port(struct net_bridge_port *p, char *buf) { struct net_bridge_port *backup_p; int ret = 0; rcu_read_lock(); backup_p = rcu_dereference(p->backup_port); if (backup_p) ret = sprintf(buf, "%s\n", backup_p->dev->name); rcu_read_unlock(); return ret; } static int store_backup_port(struct net_bridge_port *p, char *buf) { struct net_device *backup_dev = NULL; char *nl = strchr(buf, '\n'); if (nl) *nl = '\0'; if (strlen(buf) > 0) { backup_dev = __dev_get_by_name(dev_net(p->dev), buf); if (!backup_dev) return -ENOENT; } return nbp_backup_change(p, backup_dev); } static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port); BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router); } static int store_multicast_router(struct net_bridge_port *p, unsigned long v) { return br_multicast_set_port_router(&p->multicast_ctx, v); } static BRPORT_ATTR(multicast_router, 0644, show_multicast_router, store_multicast_router); BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE); BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST); #endif static const struct brport_attribute *brport_attrs[] = { &brport_attr_path_cost, &brport_attr_priority, &brport_attr_port_id, &brport_attr_port_no, &brport_attr_designated_root, &brport_attr_designated_bridge, &brport_attr_designated_port, &brport_attr_designated_cost, &brport_attr_state, &brport_attr_change_ack, &brport_attr_config_pending, &brport_attr_message_age_timer, &brport_attr_forward_delay_timer, &brport_attr_hold_timer, &brport_attr_flush, &brport_attr_hairpin_mode, &brport_attr_bpdu_guard, &brport_attr_root_block, &brport_attr_learning, &brport_attr_unicast_flood, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, &brport_attr_multicast_to_unicast, #endif &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, &brport_attr_broadcast_flood, &brport_attr_group_fwd_mask, &brport_attr_neigh_suppress, &brport_attr_isolated, &brport_attr_backup_port, NULL }; #define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) static ssize_t brport_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct brport_attribute *brport_attr = to_brport_attr(attr); struct net_bridge_port *p = kobj_to_brport(kobj); if (!brport_attr->show) return -EINVAL; return brport_attr->show(p, buf); } static ssize_t brport_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct brport_attribute *brport_attr = to_brport_attr(attr); struct net_bridge_port *p = kobj_to_brport(kobj); ssize_t ret = -EINVAL; unsigned long val; char *endp; if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!rtnl_trylock()) return restart_syscall(); if (brport_attr->store_raw) { char *buf_copy; buf_copy = kstrndup(buf, count, GFP_KERNEL); if (!buf_copy) { ret = -ENOMEM; goto out_unlock; } spin_lock_bh(&p->br->lock); ret = brport_attr->store_raw(p, buf_copy); spin_unlock_bh(&p->br->lock); kfree(buf_copy); } else if (brport_attr->store) { val = simple_strtoul(buf, &endp, 0); if (endp == buf) goto out_unlock; spin_lock_bh(&p->br->lock); ret = brport_attr->store(p, val); spin_unlock_bh(&p->br->lock); } if (!ret) { br_ifinfo_notify(RTM_NEWLINK, NULL, p); ret = count; } out_unlock: rtnl_unlock(); return ret; } const struct sysfs_ops brport_sysfs_ops = { .show = brport_show, .store = brport_store, }; /* * Add sysfs entries to ethernet device added to a bridge. * Creates a brport subdirectory with bridge attributes. * Puts symlink in bridge's brif subdirectory */ int br_sysfs_addif(struct net_bridge_port *p) { struct net_bridge *br = p->br; const struct brport_attribute **a; int err; err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj, SYSFS_BRIDGE_PORT_LINK); if (err) return err; for (a = brport_attrs; *a; ++a) { err = sysfs_create_file(&p->kobj, &((*a)->attr)); if (err) return err; } strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ); return sysfs_create_link(br->ifobj, &p->kobj, p->sysfs_name); } /* Rename bridge's brif symlink */ int br_sysfs_renameif(struct net_bridge_port *p) { struct net_bridge *br = p->br; int err; /* If a rename fails, the rollback will cause another * rename call with the existing name. */ if (!strncmp(p->sysfs_name, p->dev->name, IFNAMSIZ)) return 0; err = sysfs_rename_link(br->ifobj, &p->kobj, p->sysfs_name, p->dev->name); if (err) netdev_notice(br->dev, "unable to rename link %s to %s", p->sysfs_name, p->dev->name); else strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ); return err; }
3 563 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 // SPDX-License-Identifier: GPL-2.0-or-later /* * UDPLITEv6 An implementation of the UDP-Lite protocol over IPv6. * See also net/ipv4/udplite.c * * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk> * * Changes: * Fixes: */ #include <linux/export.h> #include <linux/proc_fs.h> #include "udp_impl.h" static int udplitev6_rcv(struct sk_buff *skb) { return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } static int udplitev6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { return __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table); } static const struct inet6_protocol udplitev6_protocol = { .handler = udplitev6_rcv, .err_handler = udplitev6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; struct proto udplitev6_prot = { .name = "UDPLITEv6", .owner = THIS_MODULE, .close = udp_lib_close, .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udplite_sk_init, .destroy = udpv6_destroy_sock, .setsockopt = udpv6_setsockopt, .getsockopt = udpv6_getsockopt, .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp6_sock), .h.udp_table = &udplite_table, }; static struct inet_protosw udplite6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_UDPLITE, .prot = &udplitev6_prot, .ops = &inet6_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }; int __init udplitev6_init(void) { int ret; ret = inet6_add_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); if (ret) goto out; ret = inet6_register_protosw(&udplite6_protosw); if (ret) goto out_udplitev6_protocol; out: return ret; out_udplitev6_protocol: inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); goto out; } void udplitev6_exit(void) { inet6_unregister_protosw(&udplite6_protosw); inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); } #ifdef CONFIG_PROC_FS static struct udp_seq_afinfo udplite6_seq_afinfo = { .family = AF_INET6, .udp_table = &udplite_table, }; static int __net_init udplite6_proc_init_net(struct net *net) { if (!proc_create_net_data("udplite6", 0444, net->proc_net, &udp6_seq_ops, sizeof(struct udp_iter_state), &udplite6_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit udplite6_proc_exit_net(struct net *net) { remove_proc_entry("udplite6", net->proc_net); } static struct pernet_operations udplite6_net_ops = { .init = udplite6_proc_init_net, .exit = udplite6_proc_exit_net, }; int __init udplite6_proc_init(void) { return register_pernet_subsys(&udplite6_net_ops); } void udplite6_proc_exit(void) { unregister_pernet_subsys(&udplite6_net_ops); } #endif
1090 1084 1083 564 564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 // SPDX-License-Identifier: GPL-2.0-only /* Cluster IP hashmark target * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> * based on ideas of Fabio Olive Leite <olive@unixforge.org> * * Development of this code funded by SuSE Linux AG, https://www.suse.com/ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/jhash.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/ip.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmp.h> #include <linux/if_arp.h> #include <linux/seq_file.h> #include <linux/refcount.h> #include <linux/netfilter_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/checksum.h> #include <net/ip.h> #define CLUSTERIP_VERSION "0.8" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); struct clusterip_config { struct list_head list; /* list of all configs */ refcount_t refcount; /* reference count */ refcount_t entries; /* number of entries/rules * referencing us */ __be32 clusterip; /* the IP address */ u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ int ifindex; /* device ifindex */ u_int16_t num_total_nodes; /* total number of nodes */ unsigned long local_nodes; /* node number array */ #ifdef CONFIG_PROC_FS struct proc_dir_entry *pde; /* proc dir entry */ #endif enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ struct rcu_head rcu; /* for call_rcu */ struct net *net; /* netns for pernet list */ char ifname[IFNAMSIZ]; /* device ifname */ }; #ifdef CONFIG_PROC_FS static const struct proc_ops clusterip_proc_ops; #endif struct clusterip_net { struct list_head configs; /* lock protects the configs list */ spinlock_t lock; bool clusterip_deprecated_warning; #ifdef CONFIG_PROC_FS struct proc_dir_entry *procdir; /* mutex protects the config->pde*/ struct mutex mutex; #endif unsigned int hook_users; }; static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); static const struct nf_hook_ops cip_arp_ops = { .hook = clusterip_arp_mangle, .pf = NFPROTO_ARP, .hooknum = NF_ARP_OUT, .priority = -1 }; static unsigned int clusterip_net_id __read_mostly; static inline struct clusterip_net *clusterip_pernet(struct net *net) { return net_generic(net, clusterip_net_id); } static inline void clusterip_config_get(struct clusterip_config *c) { refcount_inc(&c->refcount); } static void clusterip_config_rcu_free(struct rcu_head *head) { struct clusterip_config *config; struct net_device *dev; config = container_of(head, struct clusterip_config, rcu); dev = dev_get_by_name(config->net, config->ifname); if (dev) { dev_mc_del(dev, config->clustermac); dev_put(dev); } kfree(config); } static inline void clusterip_config_put(struct clusterip_config *c) { if (refcount_dec_and_test(&c->refcount)) call_rcu(&c->rcu, clusterip_config_rcu_free); } /* decrease the count of entries using/referencing this config. If last * entry(rule) is removed, remove the config from lists, but don't free it * yet, since proc-files could still be holding references */ static inline void clusterip_config_entry_put(struct clusterip_config *c) { struct clusterip_net *cn = clusterip_pernet(c->net); local_bh_disable(); if (refcount_dec_and_lock(&c->entries, &cn->lock)) { list_del_rcu(&c->list); spin_unlock(&cn->lock); local_bh_enable(); /* In case anyone still accesses the file, the open/close * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. */ #ifdef CONFIG_PROC_FS mutex_lock(&cn->mutex); if (cn->procdir) proc_remove(c->pde); mutex_unlock(&cn->mutex); #endif return; } local_bh_enable(); } static struct clusterip_config * __clusterip_config_find(struct net *net, __be32 clusterip) { struct clusterip_config *c; struct clusterip_net *cn = clusterip_pernet(net); list_for_each_entry_rcu(c, &cn->configs, list) { if (c->clusterip == clusterip) return c; } return NULL; } static inline struct clusterip_config * clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) { struct clusterip_config *c; rcu_read_lock_bh(); c = __clusterip_config_find(net, clusterip); if (c) { #ifdef CONFIG_PROC_FS if (!c->pde) c = NULL; else #endif if (unlikely(!refcount_inc_not_zero(&c->refcount))) c = NULL; else if (entry) { if (unlikely(!refcount_inc_not_zero(&c->entries))) { clusterip_config_put(c); c = NULL; } } } rcu_read_unlock_bh(); return c; } static void clusterip_config_init_nodelist(struct clusterip_config *c, const struct ipt_clusterip_tgt_info *i) { int n; for (n = 0; n < i->num_local_nodes; n++) set_bit(i->local_nodes[n] - 1, &c->local_nodes); } static int clusterip_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; spin_lock_bh(&cn->lock); list_for_each_entry_rcu(c, &cn->configs, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, c->ifname)) { c->ifindex = dev->ifindex; dev_mc_add(dev, c->clustermac); } break; case NETDEV_UNREGISTER: if (dev->ifindex == c->ifindex) { dev_mc_del(dev, c->clustermac); c->ifindex = -1; } break; case NETDEV_CHANGENAME: if (!strcmp(dev->name, c->ifname)) { c->ifindex = dev->ifindex; dev_mc_add(dev, c->clustermac); } else if (dev->ifindex == c->ifindex) { dev_mc_del(dev, c->clustermac); c->ifindex = -1; } break; } } spin_unlock_bh(&cn->lock); return NOTIFY_DONE; } static struct clusterip_config * clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, __be32 ip, const char *iniface) { struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; struct net_device *dev; int err; if (iniface[0] == '\0') { pr_info("Please specify an interface name\n"); return ERR_PTR(-EINVAL); } c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return ERR_PTR(-ENOMEM); dev = dev_get_by_name(net, iniface); if (!dev) { pr_info("no such interface %s\n", iniface); kfree(c); return ERR_PTR(-ENOENT); } c->ifindex = dev->ifindex; strcpy(c->ifname, dev->name); memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); dev_mc_add(dev, c->clustermac); dev_put(dev); c->clusterip = ip; c->num_total_nodes = i->num_total_nodes; clusterip_config_init_nodelist(c, i); c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; c->net = net; refcount_set(&c->refcount, 1); spin_lock_bh(&cn->lock); if (__clusterip_config_find(net, ip)) { err = -EBUSY; goto out_config_put; } list_add_rcu(&c->list, &cn->configs); spin_unlock_bh(&cn->lock); #ifdef CONFIG_PROC_FS { char buffer[16]; /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); mutex_lock(&cn->mutex); c->pde = proc_create_data(buffer, 0600, cn->procdir, &clusterip_proc_ops, c); mutex_unlock(&cn->mutex); if (!c->pde) { err = -ENOMEM; goto err; } } #endif refcount_set(&c->entries, 1); return c; #ifdef CONFIG_PROC_FS err: #endif spin_lock_bh(&cn->lock); list_del_rcu(&c->list); out_config_put: spin_unlock_bh(&cn->lock); clusterip_config_put(c); return ERR_PTR(err); } #ifdef CONFIG_PROC_FS static int clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) { if (nodenum == 0 || nodenum > c->num_total_nodes) return 1; /* check if we already have this number in our bitfield */ if (test_and_set_bit(nodenum - 1, &c->local_nodes)) return 1; return 0; } static bool clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) { if (nodenum == 0 || nodenum > c->num_total_nodes) return true; if (test_and_clear_bit(nodenum - 1, &c->local_nodes)) return false; return true; } #endif static inline u_int32_t clusterip_hashfn(const struct sk_buff *skb, const struct clusterip_config *config) { const struct iphdr *iph = ip_hdr(skb); unsigned long hashval; u_int16_t sport = 0, dport = 0; int poff; poff = proto_ports_offset(iph->protocol); if (poff >= 0) { const u_int16_t *ports; u16 _ports[2]; ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); if (ports) { sport = ports[0]; dport = ports[1]; } } else { net_info_ratelimited("unknown protocol %u\n", iph->protocol); } switch (config->hash_mode) { case CLUSTERIP_HASHMODE_SIP: hashval = jhash_1word(ntohl(iph->saddr), config->hash_initval); break; case CLUSTERIP_HASHMODE_SIP_SPT: hashval = jhash_2words(ntohl(iph->saddr), sport, config->hash_initval); break; case CLUSTERIP_HASHMODE_SIP_SPT_DPT: hashval = jhash_3words(ntohl(iph->saddr), sport, dport, config->hash_initval); break; default: /* to make gcc happy */ hashval = 0; /* This cannot happen, unless the check function wasn't called * at rule load time */ pr_info("unknown mode %u\n", config->hash_mode); BUG(); break; } /* node numbers are 1..n, not 0..n */ return reciprocal_scale(hashval, config->num_total_nodes) + 1; } static inline int clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) { return test_bit(hash - 1, &config->local_nodes); } /*********************************************************************** * IPTABLES TARGET ***********************************************************************/ static unsigned int clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int32_t hash; /* don't need to clusterip_config_get() here, since refcount * is only decremented by destroy() - and ip_tables guarantees * that the ->target() function isn't called after ->destroy() */ ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return NF_DROP; /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ if (ip_hdr(skb)->protocol == IPPROTO_ICMP && (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY)) return XT_CONTINUE; /* nf_conntrack_proto_icmp guarantees us that we only have ICMP_ECHO, * TIMESTAMP, INFO_REQUEST or ICMP_ADDRESS type icmp packets from here * on, which all have an ID field [relevant for hashing]. */ hash = clusterip_hashfn(skb, cipinfo->config); switch (ctinfo) { case IP_CT_NEW: WRITE_ONCE(ct->mark, hash); break; case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* FIXME: we don't handle expectations at the moment. * They can arrive on a different node than * the master connection (e.g. FTP passive mode) */ case IP_CT_ESTABLISHED: case IP_CT_ESTABLISHED_REPLY: break; default: /* Prevent gcc warnings */ break; } #ifdef DEBUG nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark)); if (!clusterip_responsible(cipinfo->config, hash)) { pr_debug("not responsible\n"); return NF_DROP; } pr_debug("responsible\n"); /* despite being received via linklayer multicast, this is * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */ skb->pkt_type = PACKET_HOST; return XT_CONTINUE; } static int clusterip_tg_check(const struct xt_tgchk_param *par) { struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct clusterip_net *cn = clusterip_pernet(par->net); const struct ipt_entry *e = par->entryinfo; struct clusterip_config *config; int ret, i; if (par->nft_compat) { pr_err("cannot use CLUSTERIP target from nftables compat\n"); return -EOPNOTSUPP; } if (cn->hook_users == UINT_MAX) return -EOVERFLOW; if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { pr_info("unknown mode %u\n", cipinfo->hash_mode); return -EINVAL; } if (e->ip.dmsk.s_addr != htonl(0xffffffff) || e->ip.dst.s_addr == 0) { pr_info("Please specify destination IP\n"); return -EINVAL; } if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes)) { pr_info("bad num_local_nodes %u\n", cipinfo->num_local_nodes); return -EINVAL; } for (i = 0; i < cipinfo->num_local_nodes; i++) { if (cipinfo->local_nodes[i] - 1 >= sizeof(config->local_nodes) * 8) { pr_info("bad local_nodes[%d] %u\n", i, cipinfo->local_nodes[i]); return -EINVAL; } } config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1); if (!config) { if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { pr_info("no config found for %pI4, need 'new'\n", &e->ip.dst.s_addr); return -EINVAL; } else { config = clusterip_config_init(par->net, cipinfo, e->ip.dst.s_addr, e->ip.iniface); if (IS_ERR(config)) return PTR_ERR(config); } } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) { clusterip_config_entry_put(config); clusterip_config_put(config); return -EINVAL; } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info("cannot load conntrack support for proto=%u\n", par->family); clusterip_config_entry_put(config); clusterip_config_put(config); return ret; } if (cn->hook_users == 0) { ret = nf_register_net_hook(par->net, &cip_arp_ops); if (ret < 0) { clusterip_config_entry_put(config); clusterip_config_put(config); nf_ct_netns_put(par->net, par->family); return ret; } } cn->hook_users++; if (!cn->clusterip_deprecated_warning) { pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " "use xt_cluster instead\n"); cn->clusterip_deprecated_warning = true; } cipinfo->config = config; return ret; } /* drop reference count of cluster config when rule is deleted */ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) { const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct clusterip_net *cn = clusterip_pernet(par->net); /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ clusterip_config_entry_put(cipinfo->config); clusterip_config_put(cipinfo->config); nf_ct_netns_put(par->net, par->family); cn->hook_users--; if (cn->hook_users == 0) nf_unregister_net_hook(par->net, &cip_arp_ops); } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_clusterip_tgt_info { u_int32_t flags; u_int8_t clustermac[6]; u_int16_t num_total_nodes; u_int16_t num_local_nodes; u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; u_int32_t hash_mode; u_int32_t hash_initval; compat_uptr_t config; }; #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", .family = NFPROTO_IPV4, .target = clusterip_tg, .checkentry = clusterip_tg_check, .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), .usersize = offsetof(struct ipt_clusterip_tgt_info, config), #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ .me = THIS_MODULE }; /*********************************************************************** * ARP MANGLING CODE ***********************************************************************/ /* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ struct arp_payload { u_int8_t src_hw[ETH_ALEN]; __be32 src_ip; u_int8_t dst_hw[ETH_ALEN]; __be32 dst_ip; } __packed; #ifdef DEBUG static void arp_print(struct arp_payload *payload) { #define HBUFFERLEN 30 char hbuffer[HBUFFERLEN]; int j, k; for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) { hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); hbuffer[k++] = ':'; } hbuffer[--k] = '\0'; pr_debug("src %pI4@%s, dst %pI4\n", &payload->src_ip, hbuffer, &payload->dst_ip); } #endif static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; struct net *net = state->net; /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) return NF_ACCEPT; /* we only want to mangle arp requests and replies */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) return NF_ACCEPT; payload = (void *)(arp+1); /* if there is no clusterip configuration for the arp reply's * source ip, we don't want to mangle it */ c = clusterip_config_find_get(net, payload->src_ip, 0); if (!c) return NF_ACCEPT; /* normally the linux kernel always replies to arp queries of * addresses on different interfacs. However, in the CLUSTERIP case * this wouldn't work, since we didn't subscribe the mcast group on * other interfaces */ if (c->ifindex != state->out->ifindex) { pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n", c->ifindex, state->out->ifindex); clusterip_config_put(c); return NF_ACCEPT; } /* mangle reply hardware address */ memcpy(payload->src_hw, c->clustermac, arp->ar_hln); #ifdef DEBUG pr_debug("mangled arp reply: "); arp_print(payload); #endif clusterip_config_put(c); return NF_ACCEPT; } /*********************************************************************** * PROC DIR HANDLING ***********************************************************************/ #ifdef CONFIG_PROC_FS struct clusterip_seq_position { unsigned int pos; /* position */ unsigned int weight; /* number of bits set == size */ unsigned int bit; /* current bit */ unsigned long val; /* current value */ }; static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) { struct clusterip_config *c = s->private; unsigned int weight; u_int32_t local_nodes; struct clusterip_seq_position *idx; /* FIXME: possible race */ local_nodes = c->local_nodes; weight = hweight32(local_nodes); if (*pos >= weight) return NULL; idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL); if (!idx) return ERR_PTR(-ENOMEM); idx->pos = *pos; idx->weight = weight; idx->bit = ffs(local_nodes); idx->val = local_nodes; clear_bit(idx->bit - 1, &idx->val); return idx; } static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct clusterip_seq_position *idx = v; *pos = ++idx->pos; if (*pos >= idx->weight) { kfree(v); return NULL; } idx->bit = ffs(idx->val); clear_bit(idx->bit - 1, &idx->val); return idx; } static void clusterip_seq_stop(struct seq_file *s, void *v) { if (!IS_ERR(v)) kfree(v); } static int clusterip_seq_show(struct seq_file *s, void *v) { struct clusterip_seq_position *idx = v; if (idx->pos != 0) seq_putc(s, ','); seq_printf(s, "%u", idx->bit); if (idx->pos == idx->weight - 1) seq_putc(s, '\n'); return 0; } static const struct seq_operations clusterip_seq_ops = { .start = clusterip_seq_start, .next = clusterip_seq_next, .stop = clusterip_seq_stop, .show = clusterip_seq_show, }; static int clusterip_proc_open(struct inode *inode, struct file *file) { int ret = seq_open(file, &clusterip_seq_ops); if (!ret) { struct seq_file *sf = file->private_data; struct clusterip_config *c = PDE_DATA(inode); sf->private = c; clusterip_config_get(c); } return ret; } static int clusterip_proc_release(struct inode *inode, struct file *file) { struct clusterip_config *c = PDE_DATA(inode); int ret; ret = seq_release(inode, file); if (!ret) clusterip_config_put(c); return ret; } static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { struct clusterip_config *c = PDE_DATA(file_inode(file)); #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; unsigned long nodenum; int rc; if (size > PROC_WRITELEN) return -EIO; if (copy_from_user(buffer, input, size)) return -EFAULT; buffer[size] = 0; if (*buffer == '+') { rc = kstrtoul(buffer+1, 10, &nodenum); if (rc) return rc; if (clusterip_add_node(c, nodenum)) return -ENOMEM; } else if (*buffer == '-') { rc = kstrtoul(buffer+1, 10, &nodenum); if (rc) return rc; if (clusterip_del_node(c, nodenum)) return -ENOENT; } else return -EIO; return size; } static const struct proc_ops clusterip_proc_ops = { .proc_open = clusterip_proc_open, .proc_read = seq_read, .proc_write = clusterip_proc_write, .proc_lseek = seq_lseek, .proc_release = clusterip_proc_release, }; #endif /* CONFIG_PROC_FS */ static int clusterip_net_init(struct net *net) { struct clusterip_net *cn = clusterip_pernet(net); INIT_LIST_HEAD(&cn->configs); spin_lock_init(&cn->lock); #ifdef CONFIG_PROC_FS cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); if (!cn->procdir) { pr_err("Unable to proc dir entry\n"); return -ENOMEM; } mutex_init(&cn->mutex); #endif /* CONFIG_PROC_FS */ return 0; } static void clusterip_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS struct clusterip_net *cn = clusterip_pernet(net); mutex_lock(&cn->mutex); proc_remove(cn->procdir); cn->procdir = NULL; mutex_unlock(&cn->mutex); #endif } static struct pernet_operations clusterip_net_ops = { .init = clusterip_net_init, .exit = clusterip_net_exit, .id = &clusterip_net_id, .size = sizeof(struct clusterip_net), }; static struct notifier_block cip_netdev_notifier = { .notifier_call = clusterip_netdev_event }; static int __init clusterip_tg_init(void) { int ret; ret = register_pernet_subsys(&clusterip_net_ops); if (ret < 0) return ret; ret = xt_register_target(&clusterip_tg_reg); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&cip_netdev_notifier); if (ret < 0) goto unregister_target; pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION); return 0; unregister_target: xt_unregister_target(&clusterip_tg_reg); cleanup_subsys: unregister_pernet_subsys(&clusterip_net_ops); return ret; } static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); unregister_netdevice_notifier(&cip_netdev_notifier); xt_unregister_target(&clusterip_tg_reg); unregister_pernet_subsys(&clusterip_net_ops); /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */ rcu_barrier(); } module_init(clusterip_tg_init); module_exit(clusterip_tg_exit);
561 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 // SPDX-License-Identifier: GPL-2.0-or-later /* * ebtables * * Author: * Bart De Schuymer <bdschuym@pandora.be> * * ebtables.c,v 2.0, July, 2002 * * This code is strongly inspired by the iptables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kmod.h> #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/smp.h> #include <linux/cpumask.h> #include <linux/audit.h> #include <net/sock.h> #include <net/netns/generic.h> /* needed for logical [in,out]-dev filtering */ #include "../br_private.h" /* Each cpu has its own set of counters, so there is no need for write_lock in * the softirq * For reading or updating the counters, the user context needs to * get a write_lock */ /* The size of each set of counters is altered to get cache alignment */ #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) #define COUNTER_OFFSET(n) (SMP_ALIGN(n * sizeof(struct ebt_counter))) #define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)c) + \ COUNTER_OFFSET(n) * cpu)) struct ebt_pernet { struct list_head tables; }; struct ebt_template { struct list_head list; char name[EBT_TABLE_MAXNAMELEN]; struct module *owner; /* called when table is needed in the given netns */ int (*table_init)(struct net *net); }; static unsigned int ebt_pernet_id __read_mostly; static LIST_HEAD(template_tables); static DEFINE_MUTEX(ebt_mutex); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void ebt_standard_compat_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; if (v >= 0) v += xt_compat_calc_jump(NFPROTO_BRIDGE, v); memcpy(dst, &v, sizeof(v)); } static int ebt_standard_compat_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; if (cv >= 0) cv -= xt_compat_calc_jump(NFPROTO_BRIDGE, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } #endif static struct xt_target ebt_standard_target = { .name = "standard", .revision = 0, .family = NFPROTO_BRIDGE, .targetsize = sizeof(int), #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = ebt_standard_compat_from_user, .compat_to_user = ebt_standard_compat_to_user, #endif }; static inline int ebt_do_watcher(const struct ebt_entry_watcher *w, struct sk_buff *skb, struct xt_action_param *par) { par->target = w->u.watcher; par->targinfo = w->data; w->u.watcher->target(skb, par); /* watchers don't give a verdict */ return 0; } static inline int ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb, struct xt_action_param *par) { par->match = m->u.match; par->matchinfo = m->data; return !m->u.match->match(skb, par); } static inline int ebt_dev_check(const char *entry, const struct net_device *device) { int i = 0; const char *devname; if (*entry == '\0') return 0; if (!device) return 1; devname = device->name; /* 1 is the wildcard token */ while (entry[i] != '\0' && entry[i] != 1 && entry[i] == devname[i]) i++; return devname[i] != entry[i] && entry[i] != 1; } /* process standard matches */ static inline int ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out) { const struct ethhdr *h = eth_hdr(skb); const struct net_bridge_port *p; __be16 ethproto; if (skb_vlan_tag_present(skb)) ethproto = htons(ETH_P_8021Q); else ethproto = h->h_proto; if (e->bitmask & EBT_802_3) { if (NF_INVF(e, EBT_IPROTO, eth_proto_is_802_3(ethproto))) return 1; } else if (!(e->bitmask & EBT_NOPROTO) && NF_INVF(e, EBT_IPROTO, e->ethproto != ethproto)) return 1; if (NF_INVF(e, EBT_IIN, ebt_dev_check(e->in, in))) return 1; if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out))) return 1; /* rcu_read_lock()ed by nf_hook_thresh */ if (in && (p = br_port_get_rcu(in)) != NULL && NF_INVF(e, EBT_ILOGICALIN, ebt_dev_check(e->logical_in, p->br->dev))) return 1; if (out && (p = br_port_get_rcu(out)) != NULL && NF_INVF(e, EBT_ILOGICALOUT, ebt_dev_check(e->logical_out, p->br->dev))) return 1; if (e->bitmask & EBT_SOURCEMAC) { if (NF_INVF(e, EBT_ISOURCE, !ether_addr_equal_masked(h->h_source, e->sourcemac, e->sourcemsk))) return 1; } if (e->bitmask & EBT_DESTMAC) { if (NF_INVF(e, EBT_IDEST, !ether_addr_equal_masked(h->h_dest, e->destmac, e->destmsk))) return 1; } return 0; } static inline struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry) { return (void *)entry + entry->next_offset; } static inline const struct ebt_entry_target * ebt_get_target_c(const struct ebt_entry *e) { return ebt_get_target((struct ebt_entry *)e); } /* Do some firewalling */ unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct ebt_table *table) { unsigned int hook = state->hook; int i, nentries; struct ebt_entry *point; struct ebt_counter *counter_base, *cb_base; const struct ebt_entry_target *t; int verdict, sp = 0; struct ebt_chainstack *cs; struct ebt_entries *chaininfo; const char *base; const struct ebt_table_info *private; struct xt_action_param acpar; acpar.state = state; acpar.hotdrop = false; read_lock_bh(&table->lock); private = table->private; cb_base = COUNTER_BASE(private->counters, private->nentries, smp_processor_id()); if (private->chainstack) cs = private->chainstack[smp_processor_id()]; else cs = NULL; chaininfo = private->hook_entry[hook]; nentries = private->hook_entry[hook]->nentries; point = (struct ebt_entry *)(private->hook_entry[hook]->data); counter_base = cb_base + private->hook_entry[hook]->counter_offset; /* base for chain jumps */ base = private->entries; i = 0; while (i < nentries) { if (ebt_basic_match(point, skb, state->in, state->out)) goto letscontinue; if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0) goto letscontinue; if (acpar.hotdrop) { read_unlock_bh(&table->lock); return NF_DROP; } ADD_COUNTER(*(counter_base + i), skb->len, 1); /* these should only watch: not modify, nor tell us * what to do with the packet */ EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar); t = ebt_get_target_c(point); /* standard target */ if (!t->u.target->target) verdict = ((struct ebt_standard_target *)t)->verdict; else { acpar.target = t->u.target; acpar.targinfo = t->data; verdict = t->u.target->target(skb, &acpar); } if (verdict == EBT_ACCEPT) { read_unlock_bh(&table->lock); return NF_ACCEPT; } if (verdict == EBT_DROP) { read_unlock_bh(&table->lock); return NF_DROP; } if (verdict == EBT_RETURN) { letsreturn: if (WARN(sp == 0, "RETURN on base chain")) { /* act like this is EBT_CONTINUE */ goto letscontinue; } sp--; /* put all the local variables right */ i = cs[sp].n; chaininfo = cs[sp].chaininfo; nentries = chaininfo->nentries; point = cs[sp].e; counter_base = cb_base + chaininfo->counter_offset; continue; } if (verdict == EBT_CONTINUE) goto letscontinue; if (WARN(verdict < 0, "bogus standard verdict\n")) { read_unlock_bh(&table->lock); return NF_DROP; } /* jump to a udc */ cs[sp].n = i + 1; cs[sp].chaininfo = chaininfo; cs[sp].e = ebt_next_entry(point); i = 0; chaininfo = (struct ebt_entries *) (base + verdict); if (WARN(chaininfo->distinguisher, "jump to non-chain\n")) { read_unlock_bh(&table->lock); return NF_DROP; } nentries = chaininfo->nentries; point = (struct ebt_entry *)chaininfo->data; counter_base = cb_base + chaininfo->counter_offset; sp++; continue; letscontinue: point = ebt_next_entry(point); i++; } /* I actually like this :) */ if (chaininfo->policy == EBT_RETURN) goto letsreturn; if (chaininfo->policy == EBT_ACCEPT) { read_unlock_bh(&table->lock); return NF_ACCEPT; } read_unlock_bh(&table->lock); return NF_DROP; } /* If it succeeds, returns element and locks mutex */ static inline void * find_inlist_lock_noload(struct net *net, const char *name, int *error, struct mutex *mutex) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_template *tmpl; struct ebt_table *table; mutex_lock(mutex); list_for_each_entry(table, &ebt_net->tables, list) { if (strcmp(table->name, name) == 0) return table; } list_for_each_entry(tmpl, &template_tables, list) { if (strcmp(name, tmpl->name) == 0) { struct module *owner = tmpl->owner; if (!try_module_get(owner)) goto out; mutex_unlock(mutex); *error = tmpl->table_init(net); if (*error) { module_put(owner); return NULL; } mutex_lock(mutex); module_put(owner); break; } } list_for_each_entry(table, &ebt_net->tables, list) { if (strcmp(table->name, name) == 0) return table; } out: *error = -ENOENT; mutex_unlock(mutex); return NULL; } static void * find_inlist_lock(struct net *net, const char *name, const char *prefix, int *error, struct mutex *mutex) { return try_then_request_module( find_inlist_lock_noload(net, name, error, mutex), "%s%s", prefix, name); } static inline struct ebt_table * find_table_lock(struct net *net, const char *name, int *error, struct mutex *mutex) { return find_inlist_lock(net, name, "ebtable_", error, mutex); } static inline void ebt_free_table_info(struct ebt_table_info *info) { int i; if (info->chainstack) { for_each_possible_cpu(i) vfree(info->chainstack[i]); vfree(info->chainstack); } } static inline int ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, unsigned int *cnt) { const struct ebt_entry *e = par->entryinfo; struct xt_match *match; size_t left = ((char *)e + e->watchers_offset) - (char *)m; int ret; if (left < sizeof(struct ebt_entry_match) || left - sizeof(struct ebt_entry_match) < m->match_size) return -EINVAL; match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision); if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) { if (!IS_ERR(match)) module_put(match->me); request_module("ebt_%s", m->u.name); match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision); } if (IS_ERR(match)) return PTR_ERR(match); m->u.match = match; par->match = match; par->matchinfo = m->data; ret = xt_check_match(par, m->match_size, ntohs(e->ethproto), e->invflags & EBT_IPROTO); if (ret < 0) { module_put(match->me); return ret; } (*cnt)++; return 0; } static inline int ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par, unsigned int *cnt) { const struct ebt_entry *e = par->entryinfo; struct xt_target *watcher; size_t left = ((char *)e + e->target_offset) - (char *)w; int ret; if (left < sizeof(struct ebt_entry_watcher) || left - sizeof(struct ebt_entry_watcher) < w->watcher_size) return -EINVAL; watcher = xt_request_find_target(NFPROTO_BRIDGE, w->u.name, 0); if (IS_ERR(watcher)) return PTR_ERR(watcher); if (watcher->family != NFPROTO_BRIDGE) { module_put(watcher->me); return -ENOENT; } w->u.watcher = watcher; par->target = watcher; par->targinfo = w->data; ret = xt_check_target(par, w->watcher_size, ntohs(e->ethproto), e->invflags & EBT_IPROTO); if (ret < 0) { module_put(watcher->me); return ret; } (*cnt)++; return 0; } static int ebt_verify_pointers(const struct ebt_replace *repl, struct ebt_table_info *newinfo) { unsigned int limit = repl->entries_size; unsigned int valid_hooks = repl->valid_hooks; unsigned int offset = 0; int i; for (i = 0; i < NF_BR_NUMHOOKS; i++) newinfo->hook_entry[i] = NULL; newinfo->entries_size = repl->entries_size; newinfo->nentries = repl->nentries; while (offset < limit) { size_t left = limit - offset; struct ebt_entry *e = (void *)newinfo->entries + offset; if (left < sizeof(unsigned int)) break; for (i = 0; i < NF_BR_NUMHOOKS; i++) { if ((valid_hooks & (1 << i)) == 0) continue; if ((char __user *)repl->hook_entry[i] == repl->entries + offset) break; } if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) { if (e->bitmask != 0) { /* we make userspace set this right, * so there is no misunderstanding */ return -EINVAL; } if (i != NF_BR_NUMHOOKS) newinfo->hook_entry[i] = (struct ebt_entries *)e; if (left < sizeof(struct ebt_entries)) break; offset += sizeof(struct ebt_entries); } else { if (left < sizeof(struct ebt_entry)) break; if (left < e->next_offset) break; if (e->next_offset < sizeof(struct ebt_entry)) return -EINVAL; offset += e->next_offset; } } if (offset != limit) return -EINVAL; /* check if all valid hooks have a chain */ for (i = 0; i < NF_BR_NUMHOOKS; i++) { if (!newinfo->hook_entry[i] && (valid_hooks & (1 << i))) return -EINVAL; } return 0; } /* this one is very careful, as it is the first function * to parse the userspace data */ static inline int ebt_check_entry_size_and_hooks(const struct ebt_entry *e, const struct ebt_table_info *newinfo, unsigned int *n, unsigned int *cnt, unsigned int *totalcnt, unsigned int *udc_cnt) { int i; for (i = 0; i < NF_BR_NUMHOOKS; i++) { if ((void *)e == (void *)newinfo->hook_entry[i]) break; } /* beginning of a new chain * if i == NF_BR_NUMHOOKS it must be a user defined chain */ if (i != NF_BR_NUMHOOKS || !e->bitmask) { /* this checks if the previous chain has as many entries * as it said it has */ if (*n != *cnt) return -EINVAL; if (((struct ebt_entries *)e)->policy != EBT_DROP && ((struct ebt_entries *)e)->policy != EBT_ACCEPT) { /* only RETURN from udc */ if (i != NF_BR_NUMHOOKS || ((struct ebt_entries *)e)->policy != EBT_RETURN) return -EINVAL; } if (i == NF_BR_NUMHOOKS) /* it's a user defined chain */ (*udc_cnt)++; if (((struct ebt_entries *)e)->counter_offset != *totalcnt) return -EINVAL; *n = ((struct ebt_entries *)e)->nentries; *cnt = 0; return 0; } /* a plain old entry, heh */ if (sizeof(struct ebt_entry) > e->watchers_offset || e->watchers_offset > e->target_offset || e->target_offset >= e->next_offset) return -EINVAL; /* this is not checked anywhere else */ if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target)) return -EINVAL; (*cnt)++; (*totalcnt)++; return 0; } struct ebt_cl_stack { struct ebt_chainstack cs; int from; unsigned int hookmask; }; /* We need these positions to check that the jumps to a different part of the * entries is a jump to the beginning of a new chain. */ static inline int ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo, unsigned int *n, struct ebt_cl_stack *udc) { int i; /* we're only interested in chain starts */ if (e->bitmask) return 0; for (i = 0; i < NF_BR_NUMHOOKS; i++) { if (newinfo->hook_entry[i] == (struct ebt_entries *)e) break; } /* only care about udc */ if (i != NF_BR_NUMHOOKS) return 0; udc[*n].cs.chaininfo = (struct ebt_entries *)e; /* these initialisations are depended on later in check_chainloops() */ udc[*n].cs.n = 0; udc[*n].hookmask = 0; (*n)++; return 0; } static inline int ebt_cleanup_match(struct ebt_entry_match *m, struct net *net, unsigned int *i) { struct xt_mtdtor_param par; if (i && (*i)-- == 0) return 1; par.net = net; par.match = m->u.match; par.matchinfo = m->data; par.family = NFPROTO_BRIDGE; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); return 0; } static inline int ebt_cleanup_watcher(struct ebt_entry_watcher *w, struct net *net, unsigned int *i) { struct xt_tgdtor_param par; if (i && (*i)-- == 0) return 1; par.net = net; par.target = w->u.watcher; par.targinfo = w->data; par.family = NFPROTO_BRIDGE; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); return 0; } static inline int ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) { struct xt_tgdtor_param par; struct ebt_entry_target *t; if (e->bitmask == 0) return 0; /* we're done */ if (cnt && (*cnt)-- == 0) return 1; EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL); EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); t = ebt_get_target(e); par.net = net; par.target = t->u.target; par.targinfo = t->data; par.family = NFPROTO_BRIDGE; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); return 0; } static inline int ebt_check_entry(struct ebt_entry *e, struct net *net, const struct ebt_table_info *newinfo, const char *name, unsigned int *cnt, struct ebt_cl_stack *cl_s, unsigned int udc_cnt) { struct ebt_entry_target *t; struct xt_target *target; unsigned int i, j, hook = 0, hookmask = 0; size_t gap; int ret; struct xt_mtchk_param mtpar; struct xt_tgchk_param tgpar; /* don't mess with the struct ebt_entries */ if (e->bitmask == 0) return 0; if (e->bitmask & ~EBT_F_MASK) return -EINVAL; if (e->invflags & ~EBT_INV_MASK) return -EINVAL; if ((e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3)) return -EINVAL; /* what hook do we belong to? */ for (i = 0; i < NF_BR_NUMHOOKS; i++) { if (!newinfo->hook_entry[i]) continue; if ((char *)newinfo->hook_entry[i] < (char *)e) hook = i; else break; } /* (1 << NF_BR_NUMHOOKS) tells the check functions the rule is on * a base chain */ if (i < NF_BR_NUMHOOKS) hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS); else { for (i = 0; i < udc_cnt; i++) if ((char *)(cl_s[i].cs.chaininfo) > (char *)e) break; if (i == 0) hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS); else hookmask = cl_s[i - 1].hookmask; } i = 0; memset(&mtpar, 0, sizeof(mtpar)); memset(&tgpar, 0, sizeof(tgpar)); mtpar.net = tgpar.net = net; mtpar.table = tgpar.table = name; mtpar.entryinfo = tgpar.entryinfo = e; mtpar.hook_mask = tgpar.hook_mask = hookmask; mtpar.family = tgpar.family = NFPROTO_BRIDGE; ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i); if (ret != 0) goto cleanup_matches; j = 0; ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j); if (ret != 0) goto cleanup_watchers; t = ebt_get_target(e); gap = e->next_offset - e->target_offset; target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0); if (IS_ERR(target)) { ret = PTR_ERR(target); goto cleanup_watchers; } /* Reject UNSPEC, xtables verdicts/return values are incompatible */ if (target->family != NFPROTO_BRIDGE) { module_put(target->me); ret = -ENOENT; goto cleanup_watchers; } t->u.target = target; if (t->u.target == &ebt_standard_target) { if (gap < sizeof(struct ebt_standard_target)) { ret = -EFAULT; goto cleanup_watchers; } if (((struct ebt_standard_target *)t)->verdict < -NUM_STANDARD_TARGETS) { ret = -EFAULT; goto cleanup_watchers; } } else if (t->target_size > gap - sizeof(struct ebt_entry_target)) { module_put(t->u.target->me); ret = -EFAULT; goto cleanup_watchers; } tgpar.target = target; tgpar.targinfo = t->data; ret = xt_check_target(&tgpar, t->target_size, ntohs(e->ethproto), e->invflags & EBT_IPROTO); if (ret < 0) { module_put(target->me); goto cleanup_watchers; } (*cnt)++; return 0; cleanup_watchers: EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, &j); cleanup_matches: EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i); return ret; } /* checks for loops and sets the hook mask for udc * the hook mask for udc tells us from which base chains the udc can be * accessed. This mask is a parameter to the check() functions of the extensions */ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack *cl_s, unsigned int udc_cnt, unsigned int hooknr, char *base) { int i, chain_nr = -1, pos = 0, nentries = chain->nentries, verdict; const struct ebt_entry *e = (struct ebt_entry *)chain->data; const struct ebt_entry_target *t; while (pos < nentries || chain_nr != -1) { /* end of udc, go back one 'recursion' step */ if (pos == nentries) { /* put back values of the time when this chain was called */ e = cl_s[chain_nr].cs.e; if (cl_s[chain_nr].from != -1) nentries = cl_s[cl_s[chain_nr].from].cs.chaininfo->nentries; else nentries = chain->nentries; pos = cl_s[chain_nr].cs.n; /* make sure we won't see a loop that isn't one */ cl_s[chain_nr].cs.n = 0; chain_nr = cl_s[chain_nr].from; if (pos == nentries) continue; } t = ebt_get_target_c(e); if (strcmp(t->u.name, EBT_STANDARD_TARGET)) goto letscontinue; if (e->target_offset + sizeof(struct ebt_standard_target) > e->next_offset) return -1; verdict = ((struct ebt_standard_target *)t)->verdict; if (verdict >= 0) { /* jump to another chain */ struct ebt_entries *hlp2 = (struct ebt_entries *)(base + verdict); for (i = 0; i < udc_cnt; i++) if (hlp2 == cl_s[i].cs.chaininfo) break; /* bad destination or loop */ if (i == udc_cnt) return -1; if (cl_s[i].cs.n) return -1; if (cl_s[i].hookmask & (1 << hooknr)) goto letscontinue; /* this can't be 0, so the loop test is correct */ cl_s[i].cs.n = pos + 1; pos = 0; cl_s[i].cs.e = ebt_next_entry(e); e = (struct ebt_entry *)(hlp2->data); nentries = hlp2->nentries; cl_s[i].from = chain_nr; chain_nr = i; /* this udc is accessible from the base chain for hooknr */ cl_s[i].hookmask |= (1 << hooknr); continue; } letscontinue: e = ebt_next_entry(e); pos++; } return 0; } /* do the parsing of the table/chains/entries/matches/watchers/targets, heh */ static int translate_table(struct net *net, const char *name, struct ebt_table_info *newinfo) { unsigned int i, j, k, udc_cnt; int ret; struct ebt_cl_stack *cl_s = NULL; /* used in the checking for chain loops */ i = 0; while (i < NF_BR_NUMHOOKS && !newinfo->hook_entry[i]) i++; if (i == NF_BR_NUMHOOKS) return -EINVAL; if (newinfo->hook_entry[i] != (struct ebt_entries *)newinfo->entries) return -EINVAL; /* make sure chains are ordered after each other in same order * as their corresponding hooks */ for (j = i + 1; j < NF_BR_NUMHOOKS; j++) { if (!newinfo->hook_entry[j]) continue; if (newinfo->hook_entry[j] <= newinfo->hook_entry[i]) return -EINVAL; i = j; } /* do some early checkings and initialize some things */ i = 0; /* holds the expected nr. of entries for the chain */ j = 0; /* holds the up to now counted entries for the chain */ k = 0; /* holds the total nr. of entries, should equal * newinfo->nentries afterwards */ udc_cnt = 0; /* will hold the nr. of user defined chains (udc) */ ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_check_entry_size_and_hooks, newinfo, &i, &j, &k, &udc_cnt); if (ret != 0) return ret; if (i != j) return -EINVAL; if (k != newinfo->nentries) return -EINVAL; /* get the location of the udc, put them in an array * while we're at it, allocate the chainstack */ if (udc_cnt) { /* this will get free'd in do_replace()/ebt_register_table() * if an error occurs */ newinfo->chainstack = vmalloc(array_size(nr_cpu_ids, sizeof(*(newinfo->chainstack)))); if (!newinfo->chainstack) return -ENOMEM; for_each_possible_cpu(i) { newinfo->chainstack[i] = vmalloc_node(array_size(udc_cnt, sizeof(*(newinfo->chainstack[0]))), cpu_to_node(i)); if (!newinfo->chainstack[i]) { while (i) vfree(newinfo->chainstack[--i]); vfree(newinfo->chainstack); newinfo->chainstack = NULL; return -ENOMEM; } } cl_s = vmalloc(array_size(udc_cnt, sizeof(*cl_s))); if (!cl_s) return -ENOMEM; i = 0; /* the i'th udc */ EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_get_udc_positions, newinfo, &i, cl_s); /* sanity check */ if (i != udc_cnt) { vfree(cl_s); return -EFAULT; } } /* Check for loops */ for (i = 0; i < NF_BR_NUMHOOKS; i++) if (newinfo->hook_entry[i]) if (check_chainloops(newinfo->hook_entry[i], cl_s, udc_cnt, i, newinfo->entries)) { vfree(cl_s); return -EINVAL; } /* we now know the following (along with E=mc²): * - the nr of entries in each chain is right * - the size of the allocated space is right * - all valid hooks have a corresponding chain * - there are no loops * - wrong data can still be on the level of a single entry * - could be there are jumps to places that are not the * beginning of a chain. This can only occur in chains that * are not accessible from any base chains, so we don't care. */ /* used to know what we need to clean up if something goes wrong */ i = 0; ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt); if (ret != 0) { EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_cleanup_entry, net, &i); } vfree(cl_s); return ret; } /* called under write_lock */ static void get_counters(const struct ebt_counter *oldcounters, struct ebt_counter *counters, unsigned int nentries) { int i, cpu; struct ebt_counter *counter_base; /* counters of cpu 0 */ memcpy(counters, oldcounters, sizeof(struct ebt_counter) * nentries); /* add other counters to those of cpu 0 */ for_each_possible_cpu(cpu) { if (cpu == 0) continue; counter_base = COUNTER_BASE(oldcounters, nentries, cpu); for (i = 0; i < nentries; i++) ADD_COUNTER(counters[i], counter_base[i].bcnt, counter_base[i].pcnt); } } static int do_replace_finish(struct net *net, struct ebt_replace *repl, struct ebt_table_info *newinfo) { int ret; struct ebt_counter *counterstmp = NULL; /* used to be able to unlock earlier */ struct ebt_table_info *table; struct ebt_table *t; /* the user wants counters back * the check on the size is done later, when we have the lock */ if (repl->num_counters) { unsigned long size = repl->num_counters * sizeof(*counterstmp); counterstmp = vmalloc(size); if (!counterstmp) return -ENOMEM; } newinfo->chainstack = NULL; ret = ebt_verify_pointers(repl, newinfo); if (ret != 0) goto free_counterstmp; ret = translate_table(net, repl->name, newinfo); if (ret != 0) goto free_counterstmp; t = find_table_lock(net, repl->name, &ret, &ebt_mutex); if (!t) { ret = -ENOENT; goto free_iterate; } if (repl->valid_hooks != t->valid_hooks) { ret = -EINVAL; goto free_unlock; } if (repl->num_counters && repl->num_counters != t->private->nentries) { ret = -EINVAL; goto free_unlock; } /* we have the mutex lock, so no danger in reading this pointer */ table = t->private; /* make sure the table can only be rmmod'ed if it contains no rules */ if (!table->nentries && newinfo->nentries && !try_module_get(t->me)) { ret = -ENOENT; goto free_unlock; } else if (table->nentries && !newinfo->nentries) module_put(t->me); /* we need an atomic snapshot of the counters */ write_lock_bh(&t->lock); if (repl->num_counters) get_counters(t->private->counters, counterstmp, t->private->nentries); t->private = newinfo; write_unlock_bh(&t->lock); mutex_unlock(&ebt_mutex); /* so, a user can change the chains while having messed up her counter * allocation. Only reason why this is done is because this way the lock * is held only once, while this doesn't bring the kernel into a * dangerous state. */ if (repl->num_counters && copy_to_user(repl->counters, counterstmp, repl->num_counters * sizeof(struct ebt_counter))) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n"); } /* decrease module count and free resources */ EBT_ENTRY_ITERATE(table->entries, table->entries_size, ebt_cleanup_entry, net, NULL); vfree(table->entries); ebt_free_table_info(table); vfree(table); vfree(counterstmp); audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REPLACE, GFP_KERNEL); return 0; free_unlock: mutex_unlock(&ebt_mutex); free_iterate: EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_cleanup_entry, net, NULL); free_counterstmp: vfree(counterstmp); /* can be initialized in translate_table() */ ebt_free_table_info(newinfo); return ret; } /* replace the table */ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret, countersize; struct ebt_table_info *newinfo; struct ebt_replace tmp; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; if (len != sizeof(tmp) + tmp.entries_size) return -EINVAL; if (tmp.entries_size == 0) return -EINVAL; /* overflow check */ if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) / NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter)) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; tmp.name[sizeof(tmp.name) - 1] = 0; countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT); if (!newinfo) return -ENOMEM; if (countersize) memset(newinfo->counters, 0, countersize); newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT); if (!newinfo->entries) { ret = -ENOMEM; goto free_newinfo; } if (copy_from_user( newinfo->entries, tmp.entries, tmp.entries_size) != 0) { ret = -EFAULT; goto free_entries; } ret = do_replace_finish(net, &tmp, newinfo); if (ret == 0) return ret; free_entries: vfree(newinfo->entries); free_newinfo: vfree(newinfo); return ret; } static void __ebt_unregister_table(struct net *net, struct ebt_table *table) { mutex_lock(&ebt_mutex); list_del(&table->list); mutex_unlock(&ebt_mutex); audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries, AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, ebt_cleanup_entry, net, NULL); if (table->private->nentries) module_put(table->me); vfree(table->private->entries); ebt_free_table_info(table->private); vfree(table->private); kfree(table->ops); kfree(table); } int ebt_register_table(struct net *net, const struct ebt_table *input_table, const struct nf_hook_ops *template_ops) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table_info *newinfo; struct ebt_table *t, *table; struct nf_hook_ops *ops; unsigned int num_ops; struct ebt_replace_kernel *repl; int ret, i, countersize; void *p; if (input_table == NULL || (repl = input_table->table) == NULL || repl->entries == NULL || repl->entries_size == 0 || repl->counters != NULL || input_table->private != NULL) return -EINVAL; /* Don't add one table to multiple lists. */ table = kmemdup(input_table, sizeof(struct ebt_table), GFP_KERNEL); if (!table) { ret = -ENOMEM; goto out; } countersize = COUNTER_OFFSET(repl->nentries) * nr_cpu_ids; newinfo = vmalloc(sizeof(*newinfo) + countersize); ret = -ENOMEM; if (!newinfo) goto free_table; p = vmalloc(repl->entries_size); if (!p) goto free_newinfo; memcpy(p, repl->entries, repl->entries_size); newinfo->entries = p; newinfo->entries_size = repl->entries_size; newinfo->nentries = repl->nentries; if (countersize) memset(newinfo->counters, 0, countersize); /* fill in newinfo and parse the entries */ newinfo->chainstack = NULL; for (i = 0; i < NF_BR_NUMHOOKS; i++) { if ((repl->valid_hooks & (1 << i)) == 0) newinfo->hook_entry[i] = NULL; else newinfo->hook_entry[i] = p + ((char *)repl->hook_entry[i] - repl->entries); } ret = translate_table(net, repl->name, newinfo); if (ret != 0) goto free_chainstack; table->private = newinfo; rwlock_init(&table->lock); mutex_lock(&ebt_mutex); list_for_each_entry(t, &ebt_net->tables, list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; goto free_unlock; } } /* Hold a reference count if the chains aren't empty */ if (newinfo->nentries && !try_module_get(table->me)) { ret = -ENOENT; goto free_unlock; } num_ops = hweight32(table->valid_hooks); if (num_ops == 0) { ret = -EINVAL; goto free_unlock; } ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); if (!ops) { ret = -ENOMEM; if (newinfo->nentries) module_put(table->me); goto free_unlock; } for (i = 0; i < num_ops; i++) ops[i].priv = table; list_add(&table->list, &ebt_net->tables); mutex_unlock(&ebt_mutex); table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); if (ret) __ebt_unregister_table(net, table); audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REGISTER, GFP_KERNEL); return ret; free_unlock: mutex_unlock(&ebt_mutex); free_chainstack: ebt_free_table_info(newinfo); vfree(newinfo->entries); free_newinfo: vfree(newinfo); free_table: kfree(table); out: return ret; } int ebt_register_template(const struct ebt_table *t, int (*table_init)(struct net *net)) { struct ebt_template *tmpl; mutex_lock(&ebt_mutex); list_for_each_entry(tmpl, &template_tables, list) { if (WARN_ON_ONCE(strcmp(t->name, tmpl->name) == 0)) { mutex_unlock(&ebt_mutex); return -EEXIST; } } tmpl = kzalloc(sizeof(*tmpl), GFP_KERNEL); if (!tmpl) { mutex_unlock(&ebt_mutex); return -ENOMEM; } tmpl->table_init = table_init; strscpy(tmpl->name, t->name, sizeof(tmpl->name)); tmpl->owner = t->me; list_add(&tmpl->list, &template_tables); mutex_unlock(&ebt_mutex); return 0; } EXPORT_SYMBOL(ebt_register_template); void ebt_unregister_template(const struct ebt_table *t) { struct ebt_template *tmpl; mutex_lock(&ebt_mutex); list_for_each_entry(tmpl, &template_tables, list) { if (strcmp(t->name, tmpl->name)) continue; list_del(&tmpl->list); mutex_unlock(&ebt_mutex); kfree(tmpl); return; } mutex_unlock(&ebt_mutex); WARN_ON_ONCE(1); } EXPORT_SYMBOL(ebt_unregister_template); static struct ebt_table *__ebt_find_table(struct net *net, const char *name) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table *t; mutex_lock(&ebt_mutex); list_for_each_entry(t, &ebt_net->tables, list) { if (strcmp(t->name, name) == 0) { mutex_unlock(&ebt_mutex); return t; } } mutex_unlock(&ebt_mutex); return NULL; } void ebt_unregister_table_pre_exit(struct net *net, const char *name) { struct ebt_table *table = __ebt_find_table(net, name); if (table) nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(ebt_unregister_table_pre_exit); void ebt_unregister_table(struct net *net, const char *name) { struct ebt_table *table = __ebt_find_table(net, name); if (table) __ebt_unregister_table(net, table); } /* userspace just supplied us with counters */ static int do_update_counters(struct net *net, const char *name, struct ebt_counter __user *counters, unsigned int num_counters, unsigned int len) { int i, ret; struct ebt_counter *tmp; struct ebt_table *t; if (num_counters == 0) return -EINVAL; tmp = vmalloc(array_size(num_counters, sizeof(*tmp))); if (!tmp) return -ENOMEM; t = find_table_lock(net, name, &ret, &ebt_mutex); if (!t) goto free_tmp; if (num_counters != t->private->nentries) { ret = -EINVAL; goto unlock_mutex; } if (copy_from_user(tmp, counters, num_counters * sizeof(*counters))) { ret = -EFAULT; goto unlock_mutex; } /* we want an atomic add of the counters */ write_lock_bh(&t->lock); /* we add to the counters of the first cpu */ for (i = 0; i < num_counters; i++) ADD_COUNTER(t->private->counters[i], tmp[i].bcnt, tmp[i].pcnt); write_unlock_bh(&t->lock); ret = 0; unlock_mutex: mutex_unlock(&ebt_mutex); free_tmp: vfree(tmp); return ret; } static int update_counters(struct net *net, sockptr_t arg, unsigned int len) { struct ebt_replace hlp; if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter)) return -EINVAL; return do_update_counters(net, hlp.name, hlp.counters, hlp.num_counters, len); } static inline int ebt_obj_to_user(char __user *um, const char *_name, const char *data, int entrysize, int usersize, int datasize, u8 revision) { char name[EBT_EXTENSION_MAXNAMELEN] = {0}; /* ebtables expects 31 bytes long names but xt_match names are 29 bytes * long. Copy 29 bytes and fill remaining bytes with zeroes. */ strlcpy(name, _name, sizeof(name)); if (copy_to_user(um, name, EBT_EXTENSION_MAXNAMELEN) || put_user(revision, (u8 __user *)(um + EBT_EXTENSION_MAXNAMELEN)) || put_user(datasize, (int __user *)(um + EBT_EXTENSION_MAXNAMELEN + 1)) || xt_data_to_user(um + entrysize, data, usersize, datasize, XT_ALIGN(datasize))) return -EFAULT; return 0; } static inline int ebt_match_to_user(const struct ebt_entry_match *m, const char *base, char __user *ubase) { return ebt_obj_to_user(ubase + ((char *)m - base), m->u.match->name, m->data, sizeof(*m), m->u.match->usersize, m->match_size, m->u.match->revision); } static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w, const char *base, char __user *ubase) { return ebt_obj_to_user(ubase + ((char *)w - base), w->u.watcher->name, w->data, sizeof(*w), w->u.watcher->usersize, w->watcher_size, w->u.watcher->revision); } static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, char __user *ubase) { int ret; char __user *hlp; const struct ebt_entry_target *t; if (e->bitmask == 0) { /* special case !EBT_ENTRY_OR_ENTRIES */ if (copy_to_user(ubase + ((char *)e - base), e, sizeof(struct ebt_entries))) return -EFAULT; return 0; } if (copy_to_user(ubase + ((char *)e - base), e, sizeof(*e))) return -EFAULT; hlp = ubase + (((char *)e + e->target_offset) - base); t = ebt_get_target_c(e); ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase); if (ret != 0) return ret; ret = EBT_WATCHER_ITERATE(e, ebt_watcher_to_user, base, ubase); if (ret != 0) return ret; ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t), t->u.target->usersize, t->target_size, t->u.target->revision); if (ret != 0) return ret; return 0; } static int copy_counters_to_user(struct ebt_table *t, const struct ebt_counter *oldcounters, void __user *user, unsigned int num_counters, unsigned int nentries) { struct ebt_counter *counterstmp; int ret = 0; /* userspace might not need the counters */ if (num_counters == 0) return 0; if (num_counters != nentries) return -EINVAL; counterstmp = vmalloc(array_size(nentries, sizeof(*counterstmp))); if (!counterstmp) return -ENOMEM; write_lock_bh(&t->lock); get_counters(oldcounters, counterstmp, nentries); write_unlock_bh(&t->lock); if (copy_to_user(user, counterstmp, nentries * sizeof(struct ebt_counter))) ret = -EFAULT; vfree(counterstmp); return ret; } /* called with ebt_mutex locked */ static int copy_everything_to_user(struct ebt_table *t, void __user *user, const int *len, int cmd) { struct ebt_replace tmp; const struct ebt_counter *oldcounters; unsigned int entries_size, nentries; int ret; char *entries; if (cmd == EBT_SO_GET_ENTRIES) { entries_size = t->private->entries_size; nentries = t->private->nentries; entries = t->private->entries; oldcounters = t->private->counters; } else { entries_size = t->table->entries_size; nentries = t->table->nentries; entries = t->table->entries; oldcounters = t->table->counters; } if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; if (*len != sizeof(struct ebt_replace) + entries_size + (tmp.num_counters ? nentries * sizeof(struct ebt_counter) : 0)) return -EINVAL; if (tmp.nentries != nentries) return -EINVAL; if (tmp.entries_size != entries_size) return -EINVAL; ret = copy_counters_to_user(t, oldcounters, tmp.counters, tmp.num_counters, nentries); if (ret) return ret; /* set the match/watcher/target names right */ return EBT_ENTRY_ITERATE(entries, entries_size, ebt_entry_to_user, entries, tmp.entries); } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* 32 bit-userspace compatibility definitions. */ struct compat_ebt_replace { char name[EBT_TABLE_MAXNAMELEN]; compat_uint_t valid_hooks; compat_uint_t nentries; compat_uint_t entries_size; /* start of the chains */ compat_uptr_t hook_entry[NF_BR_NUMHOOKS]; /* nr of counters userspace expects back */ compat_uint_t num_counters; /* where the kernel will put the old counters. */ compat_uptr_t counters; compat_uptr_t entries; }; /* struct ebt_entry_match, _target and _watcher have same layout */ struct compat_ebt_entry_mwt { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; u8 revision; }; compat_uptr_t ptr; } u; compat_uint_t match_size; compat_uint_t data[] __aligned(__alignof__(struct compat_ebt_replace)); }; /* account for possible padding between match_size and ->data */ static int ebt_compat_entry_padsize(void) { BUILD_BUG_ON(sizeof(struct ebt_entry_match) < sizeof(struct compat_ebt_entry_mwt)); return (int) sizeof(struct ebt_entry_match) - sizeof(struct compat_ebt_entry_mwt); } static int ebt_compat_match_offset(const struct xt_match *match, unsigned int userlen) { /* ebt_among needs special handling. The kernel .matchsize is * set to -1 at registration time; at runtime an EBT_ALIGN()ed * value is expected. * Example: userspace sends 4500, ebt_among.c wants 4504. */ if (unlikely(match->matchsize == -1)) return XT_ALIGN(userlen) - COMPAT_XT_ALIGN(userlen); return xt_compat_match_offset(match); } static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, unsigned int *size) { const struct xt_match *match = m->u.match; struct compat_ebt_entry_mwt __user *cm = *dstptr; int off = ebt_compat_match_offset(match, m->match_size); compat_uint_t msize = m->match_size - off; if (WARN_ON(off >= m->match_size)) return -EINVAL; if (copy_to_user(cm->u.name, match->name, strlen(match->name) + 1) || put_user(match->revision, &cm->u.revision) || put_user(msize, &cm->match_size)) return -EFAULT; if (match->compat_to_user) { if (match->compat_to_user(cm->data, m->data)) return -EFAULT; } else { if (xt_data_to_user(cm->data, m->data, match->usersize, msize, COMPAT_XT_ALIGN(msize))) return -EFAULT; } *size -= ebt_compat_entry_padsize() + off; *dstptr = cm->data; *dstptr += msize; return 0; } static int compat_target_to_user(struct ebt_entry_target *t, void __user **dstptr, unsigned int *size) { const struct xt_target *target = t->u.target; struct compat_ebt_entry_mwt __user *cm = *dstptr; int off = xt_compat_target_offset(target); compat_uint_t tsize = t->target_size - off; if (WARN_ON(off >= t->target_size)) return -EINVAL; if (copy_to_user(cm->u.name, target->name, strlen(target->name) + 1) || put_user(target->revision, &cm->u.revision) || put_user(tsize, &cm->match_size)) return -EFAULT; if (target->compat_to_user) { if (target->compat_to_user(cm->data, t->data)) return -EFAULT; } else { if (xt_data_to_user(cm->data, t->data, target->usersize, tsize, COMPAT_XT_ALIGN(tsize))) return -EFAULT; } *size -= ebt_compat_entry_padsize() + off; *dstptr = cm->data; *dstptr += tsize; return 0; } static int compat_watcher_to_user(struct ebt_entry_watcher *w, void __user **dstptr, unsigned int *size) { return compat_target_to_user((struct ebt_entry_target *)w, dstptr, size); } static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr, unsigned int *size) { struct ebt_entry_target *t; struct ebt_entry __user *ce; u32 watchers_offset, target_offset, next_offset; compat_uint_t origsize; int ret; if (e->bitmask == 0) { if (*size < sizeof(struct ebt_entries)) return -EINVAL; if (copy_to_user(*dstptr, e, sizeof(struct ebt_entries))) return -EFAULT; *dstptr += sizeof(struct ebt_entries); *size -= sizeof(struct ebt_entries); return 0; } if (*size < sizeof(*ce)) return -EINVAL; ce = *dstptr; if (copy_to_user(ce, e, sizeof(*ce))) return -EFAULT; origsize = *size; *dstptr += sizeof(*ce); ret = EBT_MATCH_ITERATE(e, compat_match_to_user, dstptr, size); if (ret) return ret; watchers_offset = e->watchers_offset - (origsize - *size); ret = EBT_WATCHER_ITERATE(e, compat_watcher_to_user, dstptr, size); if (ret) return ret; target_offset = e->target_offset - (origsize - *size); t = ebt_get_target(e); ret = compat_target_to_user(t, dstptr, size); if (ret) return ret; next_offset = e->next_offset - (origsize - *size); if (put_user(watchers_offset, &ce->watchers_offset) || put_user(target_offset, &ce->target_offset) || put_user(next_offset, &ce->next_offset)) return -EFAULT; *size -= sizeof(*ce); return 0; } static int compat_calc_match(struct ebt_entry_match *m, int *off) { *off += ebt_compat_match_offset(m->u.match, m->match_size); *off += ebt_compat_entry_padsize(); return 0; } static int compat_calc_watcher(struct ebt_entry_watcher *w, int *off) { *off += xt_compat_target_offset(w->u.watcher); *off += ebt_compat_entry_padsize(); return 0; } static int compat_calc_entry(const struct ebt_entry *e, const struct ebt_table_info *info, const void *base, struct compat_ebt_replace *newinfo) { const struct ebt_entry_target *t; unsigned int entry_offset; int off, ret, i; if (e->bitmask == 0) return 0; off = 0; entry_offset = (void *)e - base; EBT_MATCH_ITERATE(e, compat_calc_match, &off); EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off); t = ebt_get_target_c(e); off += xt_compat_target_offset(t->u.target); off += ebt_compat_entry_padsize(); newinfo->entries_size -= off; ret = xt_compat_add_offset(NFPROTO_BRIDGE, entry_offset, off); if (ret) return ret; for (i = 0; i < NF_BR_NUMHOOKS; i++) { const void *hookptr = info->hook_entry[i]; if (info->hook_entry[i] && (e < (struct ebt_entry *)(base - hookptr))) { newinfo->hook_entry[i] -= off; pr_debug("0x%08X -> 0x%08X\n", newinfo->hook_entry[i] + off, newinfo->hook_entry[i]); } } return 0; } static int ebt_compat_init_offsets(unsigned int number) { if (number > INT_MAX) return -EINVAL; /* also count the base chain policies */ number += NF_BR_NUMHOOKS; return xt_compat_init_offsets(NFPROTO_BRIDGE, number); } static int compat_table_info(const struct ebt_table_info *info, struct compat_ebt_replace *newinfo) { unsigned int size = info->entries_size; const void *entries = info->entries; int ret; newinfo->entries_size = size; ret = ebt_compat_init_offsets(info->nentries); if (ret) return ret; return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); } static int compat_copy_everything_to_user(struct ebt_table *t, void __user *user, int *len, int cmd) { struct compat_ebt_replace repl, tmp; struct ebt_counter *oldcounters; struct ebt_table_info tinfo; int ret; void __user *pos; memset(&tinfo, 0, sizeof(tinfo)); if (cmd == EBT_SO_GET_ENTRIES) { tinfo.entries_size = t->private->entries_size; tinfo.nentries = t->private->nentries; tinfo.entries = t->private->entries; oldcounters = t->private->counters; } else { tinfo.entries_size = t->table->entries_size; tinfo.nentries = t->table->nentries; tinfo.entries = t->table->entries; oldcounters = t->table->counters; } if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; if (tmp.nentries != tinfo.nentries || (tmp.num_counters && tmp.num_counters != tinfo.nentries)) return -EINVAL; memcpy(&repl, &tmp, sizeof(repl)); if (cmd == EBT_SO_GET_ENTRIES) ret = compat_table_info(t->private, &repl); else ret = compat_table_info(&tinfo, &repl); if (ret) return ret; if (*len != sizeof(tmp) + repl.entries_size + (tmp.num_counters? tinfo.nentries * sizeof(struct ebt_counter): 0)) { pr_err("wrong size: *len %d, entries_size %u, replsz %d\n", *len, tinfo.entries_size, repl.entries_size); return -EINVAL; } /* userspace might not need the counters */ ret = copy_counters_to_user(t, oldcounters, compat_ptr(tmp.counters), tmp.num_counters, tinfo.nentries); if (ret) return ret; pos = compat_ptr(tmp.entries); return EBT_ENTRY_ITERATE(tinfo.entries, tinfo.entries_size, compat_copy_entry_to_user, &pos, &tmp.entries_size); } struct ebt_entries_buf_state { char *buf_kern_start; /* kernel buffer to copy (translated) data to */ u32 buf_kern_len; /* total size of kernel buffer */ u32 buf_kern_offset; /* amount of data copied so far */ u32 buf_user_offset; /* read position in userspace buffer */ }; static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz) { state->buf_kern_offset += sz; return state->buf_kern_offset >= sz ? 0 : -EINVAL; } static int ebt_buf_add(struct ebt_entries_buf_state *state, const void *data, unsigned int sz) { if (state->buf_kern_start == NULL) goto count_only; if (WARN_ON(state->buf_kern_offset + sz > state->buf_kern_len)) return -EINVAL; memcpy(state->buf_kern_start + state->buf_kern_offset, data, sz); count_only: state->buf_user_offset += sz; return ebt_buf_count(state, sz); } static int ebt_buf_add_pad(struct ebt_entries_buf_state *state, unsigned int sz) { char *b = state->buf_kern_start; if (WARN_ON(b && state->buf_kern_offset > state->buf_kern_len)) return -EINVAL; if (b != NULL && sz > 0) memset(b + state->buf_kern_offset, 0, sz); /* do not adjust ->buf_user_offset here, we added kernel-side padding */ return ebt_buf_count(state, sz); } enum compat_mwt { EBT_COMPAT_MATCH, EBT_COMPAT_WATCHER, EBT_COMPAT_TARGET, }; static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, enum compat_mwt compat_mwt, struct ebt_entries_buf_state *state, const unsigned char *base) { char name[EBT_EXTENSION_MAXNAMELEN]; struct xt_match *match; struct xt_target *wt; void *dst = NULL; int off, pad = 0; unsigned int size_kern, match_size = mwt->match_size; if (strscpy(name, mwt->u.name, sizeof(name)) < 0) return -EINVAL; if (state->buf_kern_start) dst = state->buf_kern_start + state->buf_kern_offset; switch (compat_mwt) { case EBT_COMPAT_MATCH: match = xt_request_find_match(NFPROTO_BRIDGE, name, mwt->u.revision); if (IS_ERR(match)) return PTR_ERR(match); off = ebt_compat_match_offset(match, match_size); if (dst) { if (match->compat_from_user) match->compat_from_user(dst, mwt->data); else memcpy(dst, mwt->data, match_size); } size_kern = match->matchsize; if (unlikely(size_kern == -1)) size_kern = match_size; module_put(match->me); break; case EBT_COMPAT_WATCHER: case EBT_COMPAT_TARGET: wt = xt_request_find_target(NFPROTO_BRIDGE, name, mwt->u.revision); if (IS_ERR(wt)) return PTR_ERR(wt); off = xt_compat_target_offset(wt); if (dst) { if (wt->compat_from_user) wt->compat_from_user(dst, mwt->data); else memcpy(dst, mwt->data, match_size); } size_kern = wt->targetsize; module_put(wt->me); break; default: return -EINVAL; } state->buf_kern_offset += match_size + off; state->buf_user_offset += match_size; pad = XT_ALIGN(size_kern) - size_kern; if (pad > 0 && dst) { if (WARN_ON(state->buf_kern_len <= pad)) return -EINVAL; if (WARN_ON(state->buf_kern_offset - (match_size + off) + size_kern > state->buf_kern_len - pad)) return -EINVAL; memset(dst + size_kern, 0, pad); } return off + match_size; } /* return size of all matches, watchers or target, including necessary * alignment and padding. */ static int ebt_size_mwt(const struct compat_ebt_entry_mwt *match32, unsigned int size_left, enum compat_mwt type, struct ebt_entries_buf_state *state, const void *base) { const char *buf = (const char *)match32; int growth = 0; if (size_left == 0) return 0; do { struct ebt_entry_match *match_kern; int ret; if (size_left < sizeof(*match32)) return -EINVAL; match_kern = (struct ebt_entry_match *) state->buf_kern_start; if (match_kern) { char *tmp; tmp = state->buf_kern_start + state->buf_kern_offset; match_kern = (struct ebt_entry_match *) tmp; } ret = ebt_buf_add(state, buf, sizeof(*match32)); if (ret < 0) return ret; size_left -= sizeof(*match32); /* add padding before match->data (if any) */ ret = ebt_buf_add_pad(state, ebt_compat_entry_padsize()); if (ret < 0) return ret; if (match32->match_size > size_left) return -EINVAL; size_left -= match32->match_size; ret = compat_mtw_from_user(match32, type, state, base); if (ret < 0) return ret; if (WARN_ON(ret < match32->match_size)) return -EINVAL; growth += ret - match32->match_size; growth += ebt_compat_entry_padsize(); buf += sizeof(*match32); buf += match32->match_size; if (match_kern) match_kern->match_size = ret; match32 = (struct compat_ebt_entry_mwt *) buf; } while (size_left); return growth; } /* called for all ebt_entry structures. */ static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *base, unsigned int *total, struct ebt_entries_buf_state *state) { unsigned int i, j, startoff, next_expected_off, new_offset = 0; /* stores match/watchers/targets & offset of next struct ebt_entry: */ unsigned int offsets[4]; unsigned int *offsets_update = NULL; int ret; char *buf_start; if (*total < sizeof(struct ebt_entries)) return -EINVAL; if (!entry->bitmask) { *total -= sizeof(struct ebt_entries); return ebt_buf_add(state, entry, sizeof(struct ebt_entries)); } if (*total < sizeof(*entry) || entry->next_offset < sizeof(*entry)) return -EINVAL; startoff = state->buf_user_offset; /* pull in most part of ebt_entry, it does not need to be changed. */ ret = ebt_buf_add(state, entry, offsetof(struct ebt_entry, watchers_offset)); if (ret < 0) return ret; offsets[0] = sizeof(struct ebt_entry); /* matches come first */ memcpy(&offsets[1], &entry->watchers_offset, sizeof(offsets) - sizeof(offsets[0])); if (state->buf_kern_start) { buf_start = state->buf_kern_start + state->buf_kern_offset; offsets_update = (unsigned int *) buf_start; } ret = ebt_buf_add(state, &offsets[1], sizeof(offsets) - sizeof(offsets[0])); if (ret < 0) return ret; buf_start = (char *) entry; /* 0: matches offset, always follows ebt_entry. * 1: watchers offset, from ebt_entry structure * 2: target offset, from ebt_entry structure * 3: next ebt_entry offset, from ebt_entry structure * * offsets are relative to beginning of struct ebt_entry (i.e., 0). */ for (i = 0; i < 4 ; ++i) { if (offsets[i] > *total) return -EINVAL; if (i < 3 && offsets[i] == *total) return -EINVAL; if (i == 0) continue; if (offsets[i-1] > offsets[i]) return -EINVAL; } for (i = 0, j = 1 ; j < 4 ; j++, i++) { struct compat_ebt_entry_mwt *match32; unsigned int size; char *buf = buf_start + offsets[i]; if (offsets[i] > offsets[j]) return -EINVAL; match32 = (struct compat_ebt_entry_mwt *) buf; size = offsets[j] - offsets[i]; ret = ebt_size_mwt(match32, size, i, state, base); if (ret < 0) return ret; new_offset += ret; if (offsets_update && new_offset) { pr_debug("change offset %d to %d\n", offsets_update[i], offsets[j] + new_offset); offsets_update[i] = offsets[j] + new_offset; } } if (state->buf_kern_start == NULL) { unsigned int offset = buf_start - (char *) base; ret = xt_compat_add_offset(NFPROTO_BRIDGE, offset, new_offset); if (ret < 0) return ret; } next_expected_off = state->buf_user_offset - startoff; if (next_expected_off != entry->next_offset) return -EINVAL; if (*total < entry->next_offset) return -EINVAL; *total -= entry->next_offset; return 0; } /* repl->entries_size is the size of the ebt_entry blob in userspace. * It might need more memory when copied to a 64 bit kernel in case * userspace is 32-bit. So, first task: find out how much memory is needed. * * Called before validation is performed. */ static int compat_copy_entries(unsigned char *data, unsigned int size_user, struct ebt_entries_buf_state *state) { unsigned int size_remaining = size_user; int ret; ret = EBT_ENTRY_ITERATE(data, size_user, size_entry_mwt, data, &size_remaining, state); if (ret < 0) return ret; if (size_remaining) return -EINVAL; return state->buf_kern_offset; } static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, sockptr_t arg, unsigned int len) { struct compat_ebt_replace tmp; int i; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp))) return -EFAULT; if (len != sizeof(tmp) + tmp.entries_size) return -EINVAL; if (tmp.entries_size == 0) return -EINVAL; if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) / NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter)) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry)); /* starting with hook_entry, 32 vs. 64 bit structures are different */ for (i = 0; i < NF_BR_NUMHOOKS; i++) repl->hook_entry[i] = compat_ptr(tmp.hook_entry[i]); repl->num_counters = tmp.num_counters; repl->counters = compat_ptr(tmp.counters); repl->entries = compat_ptr(tmp.entries); return 0; } static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret, i, countersize, size64; struct ebt_table_info *newinfo; struct ebt_replace tmp; struct ebt_entries_buf_state state; void *entries_tmp; ret = compat_copy_ebt_replace_from_user(&tmp, arg, len); if (ret) { /* try real handler in case userland supplied needed padding */ if (ret == -EINVAL && do_replace(net, arg, len) == 0) ret = 0; return ret; } countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; newinfo = vmalloc(sizeof(*newinfo) + countersize); if (!newinfo) return -ENOMEM; if (countersize) memset(newinfo->counters, 0, countersize); memset(&state, 0, sizeof(state)); newinfo->entries = vmalloc(tmp.entries_size); if (!newinfo->entries) { ret = -ENOMEM; goto free_newinfo; } if (copy_from_user( newinfo->entries, tmp.entries, tmp.entries_size) != 0) { ret = -EFAULT; goto free_entries; } entries_tmp = newinfo->entries; xt_compat_lock(NFPROTO_BRIDGE); ret = ebt_compat_init_offsets(tmp.nentries); if (ret < 0) goto out_unlock; ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); if (ret < 0) goto out_unlock; pr_debug("tmp.entries_size %d, kern off %d, user off %d delta %d\n", tmp.entries_size, state.buf_kern_offset, state.buf_user_offset, xt_compat_calc_jump(NFPROTO_BRIDGE, tmp.entries_size)); size64 = ret; newinfo->entries = vmalloc(size64); if (!newinfo->entries) { vfree(entries_tmp); ret = -ENOMEM; goto out_unlock; } memset(&state, 0, sizeof(state)); state.buf_kern_start = newinfo->entries; state.buf_kern_len = size64; ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); if (WARN_ON(ret < 0)) { vfree(entries_tmp); goto out_unlock; } vfree(entries_tmp); tmp.entries_size = size64; for (i = 0; i < NF_BR_NUMHOOKS; i++) { char __user *usrptr; if (tmp.hook_entry[i]) { unsigned int delta; usrptr = (char __user *) tmp.hook_entry[i]; delta = usrptr - tmp.entries; usrptr += xt_compat_calc_jump(NFPROTO_BRIDGE, delta); tmp.hook_entry[i] = (struct ebt_entries __user *)usrptr; } } xt_compat_flush_offsets(NFPROTO_BRIDGE); xt_compat_unlock(NFPROTO_BRIDGE); ret = do_replace_finish(net, &tmp, newinfo); if (ret == 0) return ret; free_entries: vfree(newinfo->entries); free_newinfo: vfree(newinfo); return ret; out_unlock: xt_compat_flush_offsets(NFPROTO_BRIDGE); xt_compat_unlock(NFPROTO_BRIDGE); goto free_entries; } static int compat_update_counters(struct net *net, sockptr_t arg, unsigned int len) { struct compat_ebt_replace hlp; if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; /* try real handler in case userland supplied needed padding */ if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter)) return update_counters(net, arg, len); return do_update_counters(net, hlp.name, compat_ptr(hlp.counters), hlp.num_counters, len); } static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; struct compat_ebt_replace tmp; struct ebt_table *t; struct net *net = sock_net(sk); if ((cmd == EBT_SO_GET_INFO || cmd == EBT_SO_GET_INIT_INFO) && *len != sizeof(struct compat_ebt_replace)) return -EINVAL; if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; tmp.name[sizeof(tmp.name) - 1] = '\0'; t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); if (!t) return ret; xt_compat_lock(NFPROTO_BRIDGE); switch (cmd) { case EBT_SO_GET_INFO: tmp.nentries = t->private->nentries; ret = compat_table_info(t->private, &tmp); if (ret) goto out; tmp.valid_hooks = t->valid_hooks; if (copy_to_user(user, &tmp, *len) != 0) { ret = -EFAULT; break; } ret = 0; break; case EBT_SO_GET_INIT_INFO: tmp.nentries = t->table->nentries; tmp.entries_size = t->table->entries_size; tmp.valid_hooks = t->table->valid_hooks; if (copy_to_user(user, &tmp, *len) != 0) { ret = -EFAULT; break; } ret = 0; break; case EBT_SO_GET_ENTRIES: case EBT_SO_GET_INIT_ENTRIES: /* try real handler first in case of userland-side padding. * in case we are dealing with an 'ordinary' 32 bit binary * without 64bit compatibility padding, this will fail right * after copy_from_user when the *len argument is validated. * * the compat_ variant needs to do one pass over the kernel * data set to adjust for size differences before it the check. */ if (copy_everything_to_user(t, user, len, cmd) == 0) ret = 0; else ret = compat_copy_everything_to_user(t, user, len, cmd); break; default: ret = -EINVAL; } out: xt_compat_flush_offsets(NFPROTO_BRIDGE); xt_compat_unlock(NFPROTO_BRIDGE); mutex_unlock(&ebt_mutex); return ret; } #endif static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct net *net = sock_net(sk); struct ebt_replace tmp; struct ebt_table *t; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* try real handler in case userland supplied needed padding */ if (in_compat_syscall() && ((cmd != EBT_SO_GET_INFO && cmd != EBT_SO_GET_INIT_INFO) || *len != sizeof(tmp))) return compat_do_ebt_get_ctl(sk, cmd, user, len); #endif if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; tmp.name[sizeof(tmp.name) - 1] = '\0'; t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); if (!t) return ret; switch (cmd) { case EBT_SO_GET_INFO: case EBT_SO_GET_INIT_INFO: if (*len != sizeof(struct ebt_replace)) { ret = -EINVAL; mutex_unlock(&ebt_mutex); break; } if (cmd == EBT_SO_GET_INFO) { tmp.nentries = t->private->nentries; tmp.entries_size = t->private->entries_size; tmp.valid_hooks = t->valid_hooks; } else { tmp.nentries = t->table->nentries; tmp.entries_size = t->table->entries_size; tmp.valid_hooks = t->table->valid_hooks; } mutex_unlock(&ebt_mutex); if (copy_to_user(user, &tmp, *len) != 0) { ret = -EFAULT; break; } ret = 0; break; case EBT_SO_GET_ENTRIES: case EBT_SO_GET_INIT_ENTRIES: ret = copy_everything_to_user(t, user, len, cmd); mutex_unlock(&ebt_mutex); break; default: mutex_unlock(&ebt_mutex); ret = -EINVAL; } return ret; } static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { struct net *net = sock_net(sk); int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case EBT_SO_SET_ENTRIES: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(net, arg, len); else #endif ret = do_replace(net, arg, len); break; case EBT_SO_SET_COUNTERS: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_update_counters(net, arg, len); else #endif ret = update_counters(net, arg, len); break; default: ret = -EINVAL; } return ret; } static struct nf_sockopt_ops ebt_sockopts = { .pf = PF_INET, .set_optmin = EBT_BASE_CTL, .set_optmax = EBT_SO_SET_MAX + 1, .set = do_ebt_set_ctl, .get_optmin = EBT_BASE_CTL, .get_optmax = EBT_SO_GET_MAX + 1, .get = do_ebt_get_ctl, .owner = THIS_MODULE, }; static int __net_init ebt_pernet_init(struct net *net) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); INIT_LIST_HEAD(&ebt_net->tables); return 0; } static struct pernet_operations ebt_net_ops = { .init = ebt_pernet_init, .id = &ebt_pernet_id, .size = sizeof(struct ebt_pernet), }; static int __init ebtables_init(void) { int ret; ret = xt_register_target(&ebt_standard_target); if (ret < 0) return ret; ret = nf_register_sockopt(&ebt_sockopts); if (ret < 0) { xt_unregister_target(&ebt_standard_target); return ret; } ret = register_pernet_subsys(&ebt_net_ops); if (ret < 0) { nf_unregister_sockopt(&ebt_sockopts); xt_unregister_target(&ebt_standard_target); return ret; } return 0; } static void ebtables_fini(void) { nf_unregister_sockopt(&ebt_sockopts); xt_unregister_target(&ebt_standard_target); unregister_pernet_subsys(&ebt_net_ops); } EXPORT_SYMBOL(ebt_register_table); EXPORT_SYMBOL(ebt_unregister_table); EXPORT_SYMBOL(ebt_do_table); module_init(ebtables_init); module_exit(ebtables_fini); MODULE_LICENSE("GPL");
564 2 1 1 564 564 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 802 801 801 184 184 184 1085 1084 1056 795 17 167 564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 // SPDX-License-Identifier: GPL-2.0-only /* * net/core/fib_rules.c Generic Routing Rules * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> #include <linux/indirect_call_wrapper.h> #if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) #ifdef CONFIG_IP_MULTIPLE_TABLES #define INDIRECT_CALL_MT(f, f2, f1, ...) \ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #elif defined(CONFIG_IP_MULTIPLE_TABLES) #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) #endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), KUIDT_INIT(~0), }; bool fib_rule_matchall(const struct fib_rule *rule) { if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; if (fib_rule_port_range_set(&rule->sport_range)) return false; if (fib_rule_port_range_set(&rule->dport_range)) return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { struct fib_rule *r; r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; r->flags = flags; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; /* The lock is not required here, the list in unreacheable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; } EXPORT_SYMBOL(fib_default_rule_add); static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; if (!list_empty(&ops->rules_list)) { pos = ops->rules_list.next; if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; } } return 0; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) { struct fib_rules_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (ops->family == family) { if (!try_module_get(ops->owner)) ops = NULL; rcu_read_unlock(); return ops; } } rcu_read_unlock(); return NULL; } static void rules_ops_put(struct fib_rules_ops *ops) { if (ops) module_put(ops->owner); } static void flush_route_cache(struct fib_rules_ops *ops) { if (ops->flush_cache) ops->flush_cache(ops); } static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; struct net *net; net = ops->fro_net; if (ops->rule_size < sizeof(struct fib_rule)) return -EINVAL; if (ops->match == NULL || ops->configure == NULL || ops->compare == NULL || ops->fill == NULL || ops->action == NULL) return -EINVAL; spin_lock(&net->rules_mod_lock); list_for_each_entry(o, &net->rules_ops, list) if (ops->family == o->family) goto errout; list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: spin_unlock(&net->rules_mod_lock); return err; } struct fib_rules_ops * fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) { struct fib_rules_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&ops->rules_list); ops->fro_net = net; err = __fib_rules_register(ops); if (err) { kfree(ops); ops = ERR_PTR(err); } return ops; } EXPORT_SYMBOL_GPL(fib_rules_register); static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); if (ops->delete) ops->delete(rule); fib_rule_put(rule); } } void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); spin_unlock(&net->rules_mod_lock); fib_rules_cleanup_ops(ops); kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); static int uid_range_set(struct fib_kuid_range *range) { return uid_valid(range->start) && uid_valid(range->end); } static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) { struct fib_rule_uid_range *in; struct fib_kuid_range out; in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); out.start = make_kuid(current_user_ns(), in->start); out.end = make_kuid(current_user_ns(), in->end); return out; } static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) { struct fib_rule_uid_range out = { from_kuid_munged(current_user_ns(), range->start), from_kuid_munged(current_user_ns(), range->end) }; return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } static int nla_get_port_range(struct nlattr *pattr, struct fib_rule_port_range *port_range) { const struct fib_rule_port_range *pr = nla_data(pattr); if (!fib_rule_port_range_valid(pr)) return -EINVAL; port_range->start = pr->start; port_range->end = pr->end; return 0; } static int nla_put_port_range(struct sk_buff *skb, int attrtype, struct fib_rule_port_range *range) { return nla_put(skb, attrtype, sizeof(*range), range); } static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { int ret = 0; if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) goto out; if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; if (uid_lt(fl->flowi_uid, rule->uid_range.start) || uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; ret = INDIRECT_CALL_MT(ops->match, fib6_rule_match, fib4_rule_match, rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { struct fib_rule *rule; int err; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { struct fib_rule *target; target = rcu_dereference(rule->ctarget); if (target == NULL) { continue; } else { rule = target; goto jumped; } } else if (rule->action == FR_ACT_NOP) continue; else err = INDIRECT_CALL_MT(ops->action, fib6_rule_action, fib4_rule_action, rule, fl, flags, arg); if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, rule, flags, arg)) continue; if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } break; } } err = -ESRCH; out: rcu_read_unlock(); return err; } EXPORT_SYMBOL_GPL(fib_rules_lookup); static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_rule *rule, int family, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, .info.extack = extack, .rule = rule, }; return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, struct fib_rules_ops *ops, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, .info.extack = extack, .rule = rule, }; ops->fib_rules_seq++; return call_fib_notifiers(net, event_type, &info.info); } /* Called with rcu_read_lock() */ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family, extack); if (err) break; } rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); unsigned int fib_rules_seq_read(struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; ASSERT_RTNL(); ops = lookup_rules_ops(net, family); if (!ops) return 0; fib_rules_seq = ops->fib_rules_seq; rules_ops_put(ops); return fib_rules_seq; } EXPORT_SYMBOL_GPL(fib_rules_seq_read); static struct fib_rule *rule_find(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule, bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (rule->action && r->action != rule->action) continue; if (rule->table && r->table != rule->table) continue; if (user_priority && r->pref != rule->pref) continue; if (rule->iifname[0] && memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (rule->oifname[0] && memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (rule->mark && r->mark != rule->mark) continue; if (rule->suppress_ifgroup != -1 && r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (rule->suppress_prefixlen != -1 && r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; if (uid_range_set(&rule->uid_range) && (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; if (rule->proto && r->proto != rule->proto) continue; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return r; } return NULL; } #ifdef CONFIG_NET_L3_MASTER_DEV static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { nlrule->l3mdev = nla_get_u8(nla); if (nlrule->l3mdev != 1) { NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); return -1; } return 0; } #else static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); return -1; } #endif static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || nla_len(tb[FRA_SRC]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || nla_len(tb[FRA_DST]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; } nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; } refcount_set(&nlrule->refcnt, 1); nlrule->fr_net = net; if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; } else { nlrule->pref = fib_default_rule_pref(ops); } nlrule->proto = tb[FRA_PROTOCOL] ? nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; if (tb[FRA_IIFNAME]) { struct net_device *dev; nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { struct net_device *dev; nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); err = -EINVAL; if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; nlrule->action = frh->action; nlrule->flags = frh->flags; nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { if (nlrule->action != FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; } nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ if (nlrule->target <= nlrule->pref) { NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } if (nlrule->l3mdev && nlrule->table) { NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; } } else { nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } } *rule = nlrule; return 0; errout_free: kfree(nlrule); errout: return err; } static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (r->action != rule->action) continue; if (r->table != rule->table) continue; if (r->pref != rule->pref) continue; if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (r->mark != rule->mark) continue; if (r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (r->mark_mask != rule->mark_mask) continue; if (r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (r->l3mdev != rule->l3mdev) continue; if (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; if (r->ip_proto != rule->ip_proto) continue; if (r->proto != rule->proto) continue; if (!fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return 1; } return 0; } int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *last = NULL; struct nlattr *tb[FRA_MAX + 1]; int err = -EINVAL, unresolved = 0; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); if (err < 0) goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { RCU_INIT_POINTER(rule->ctarget, r); break; } } if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; } if (last) list_add_rcu(&rule->list, &last->list); else list_add_rcu(&rule->list, &ops->rules_list); if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if * any of them are pointing to this new rule. */ list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref && rtnl_dereference(r->ctarget) == NULL) { rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; } } } if (rule->action == FR_ACT_GOTO) ops->nr_goto_rules++; if (unresolved) ops->unresolved_rules++; if (rule->tun_id) ip_tunnel_need_metadata(); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: kfree(rule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_newrule); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; int err = -EINVAL; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; goto errout; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; goto errout; } if (ops->delete) { err = ops->delete(rule); if (err) goto errout; } if (rule->tun_id) ip_tunnel_unneed_metadata(); list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { ops->nr_goto_rules--; if (rtnl_dereference(rule->ctarget) == NULL) ops->unresolved_rules--; } /* * Check if this rule is a target to any of them. If so, * adjust to the next one with the same preference or * disable them. As this operation is eventually very * expensive, it is only performed if goto rules, except * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { struct fib_rule *n; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) n = NULL; list_for_each_entry(r, &ops->rules_list, list) { if (rtnl_dereference(r->ctarget) != rule) continue; rcu_assign_pointer(r->ctarget, n); if (!n) ops->unresolved_rules++; } } call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; errout: kfree(nlrule); rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_delrule); static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); return payload; } static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, u32 pid, u32 seq, int type, int flags, struct fib_rules_ops *ops) { struct nlmsghdr *nlh; struct fib_rule_hdr *frh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); if (nlh == NULL) return -EMSGSIZE; frh = nlmsg_data(nlh); frh->family = ops->family; frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; if (rule->iifindex == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; if (rule->oifindex == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } if ((rule->pref && nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || (rule->mark && nla_put_u32(skb, FRA_FWMARK, rule->mark)) || ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) goto nla_put_failure; } if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, struct fib_rules_ops *ops) { int idx = 0; struct fib_rule *rule; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWRULE, NLM_F_MULTI, ops); if (err) break; skip: idx++; } rcu_read_unlock(); cb->args[1] = idx; rules_ops_put(ops); return err; } static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib_rule_hdr *frh; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for fib rule dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); return -EINVAL; } return 0; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int idx = 0, family; if (cb->strict_check) { int err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; } family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); if (ops == NULL) return -EAFNOSUPPORT; dump_rules(skb, cb, ops); return skb->len; } rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; if (dump_rules(skb, cb, ops) < 0) break; cb->args[1] = 0; skip: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid) { struct net *net; struct sk_buff *skb; int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); if (skb == NULL) goto errout; err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); if (err < 0) { /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) rule->iifindex = dev->ifindex; if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) rule->oifindex = dev->ifindex; } } static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) rule->iifindex = -1; if (rule->oifindex == dev->ifindex) rule->oifindex = -1; } } static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_CHANGENAME: list_for_each_entry(ops, &net->rules_ops, list) { detach_rules(&ops->rules_list, dev); attach_rules(&ops->rules_list, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; } return NOTIFY_DONE; } static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); return 0; } static void __net_exit fib_rules_net_exit(struct net *net) { WARN_ON_ONCE(!list_empty(&net->rules_ops)); } static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, }; static int __init fib_rules_init(void) { int err; rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) goto fail; err = register_netdevice_notifier(&fib_rules_notifier); if (err < 0) goto fail_unregister; return 0; fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); rtnl_unregister(PF_UNSPEC, RTM_DELRULE); rtnl_unregister(PF_UNSPEC, RTM_GETRULE); return err; } subsys_initcall(fib_rules_init);
63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations * * here's the role of the various spinlock/rwlock related include files: * * on SMP builds: * * asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the * initializers * * linux/spinlock_types_raw: * The raw types and initializers * linux/spinlock_types.h: * defines the generic type and initializers * * asm/spinlock.h: contains the arch_spin_*()/etc. lowlevel * implementations, mostly inline assembly code * * (also included on UP-debug builds:) * * linux/spinlock_api_smp.h: * contains the prototypes for the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. * * on UP builds: * * linux/spinlock_type_up.h: * contains the generic, simplified UP spinlock type. * (which is an empty structure on non-debug builds) * * linux/spinlock_types_raw: * The raw RT types and initializers * linux/spinlock_types.h: * defines the generic type and initializers * * linux/spinlock_up.h: * contains the arch_spin_*()/etc. version of UP * builds. (which are NOPs on non-debug, non-preempt * builds) * * (included on UP-non-debug builds:) * * linux/spinlock_api_up.h: * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. */ #include <linux/typecheck.h> #include <linux/preempt.h> #include <linux/linkage.h> #include <linux/compiler.h> #include <linux/irqflags.h> #include <linux/thread_info.h> #include <linux/kernel.h> #include <linux/stringify.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <asm/barrier.h> #include <asm/mmiowb.h> /* * Must define these before including other files, inline functions need them */ #define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME #define LOCK_SECTION_START(extra) \ ".subsection 1\n\t" \ extra \ ".ifndef " LOCK_SECTION_NAME "\n\t" \ LOCK_SECTION_NAME ":\n\t" \ ".endif\n" #define LOCK_SECTION_END \ ".previous\n\t" #define __lockfunc __section(".spinlock.text") /* * Pull the arch_spinlock_t and arch_rwlock_t definitions: */ #include <linux/spinlock_types.h> /* * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them): */ #ifdef CONFIG_SMP # include <asm/spinlock.h> #else # include <linux/spinlock_up.h> #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner); # define raw_spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN); \ } while (0) #else # define raw_spin_lock_init(lock) \ do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) #endif #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) #ifdef arch_spin_is_contended #define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock) #else #define raw_spin_is_contended(lock) (((void)(lock), 0)) #endif /*arch_spin_is_contended*/ /* * smp_mb__after_spinlock() provides the equivalent of a full memory barrier * between program-order earlier lock acquisitions and program-order later * memory accesses. * * This guarantees that the following two properties hold: * * 1) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 * * WRITE_ONCE(X, 1); WRITE_ONCE(Y, 1); * spin_lock(S); smp_mb(); * smp_mb__after_spinlock(); r1 = READ_ONCE(X); * r0 = READ_ONCE(Y); * spin_unlock(S); * * it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0) * and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments * preceding the call to smp_mb__after_spinlock() in __schedule() and in * try_to_wake_up(). * * 2) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 CPU2 * * spin_lock(S); spin_lock(S); r1 = READ_ONCE(Y); * WRITE_ONCE(X, 1); smp_mb__after_spinlock(); smp_rmb(); * spin_unlock(S); r0 = READ_ONCE(X); r2 = READ_ONCE(X); * WRITE_ONCE(Y, 1); * spin_unlock(S); * * it is forbidden that CPU0's critical section executes before CPU1's * critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1) * and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments * preceding the calls to smp_rmb() in try_to_wake_up() for similar * snippets but "projected" onto two CPUs. * * Property (2) upgrades the lock to an RCsc lock. * * Since most load-store architectures implement ACQUIRE with an smp_mb() after * the LL/SC loop, they need no further barriers. Similarly all our TSO * architectures imply an smp_mb() for each atomic instruction and equally don't * need more. * * Architectures that can implement ACQUIRE better need to take care. */ #ifndef smp_mb__after_spinlock #define smp_mb__after_spinlock() do { } while (0) #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); #define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock) extern int do_raw_spin_trylock(raw_spinlock_t *lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) { __acquire(lock); arch_spin_lock(&lock->raw_lock); mmiowb_spin_lock(); } #ifndef arch_spin_lock_flags #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #endif static inline void do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock) { __acquire(lock); arch_spin_lock_flags(&lock->raw_lock, *flags); mmiowb_spin_lock(); } static inline int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); if (ret) mmiowb_spin_lock(); return ret; } static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) { mmiowb_spin_unlock(); arch_spin_unlock(&lock->raw_lock); __release(lock); } #endif /* * Define the various spin_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The * various methods are defined as nops in the case they are not * required. */ #define raw_spin_trylock(lock) __cond_lock(lock, _raw_spin_trylock(lock)) #define raw_spin_lock(lock) _raw_spin_lock(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\ _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else /* * Always evaluate the 'subclass' argument to avoid that the compiler * warns about set-but-not-used variables when building with * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1. */ # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave_nested(lock, subclass); \ } while (0) #else #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #endif #else #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_lock_irqsave(lock, flags); \ } while (0) #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ raw_spin_lock_irqsave(lock, flags) #endif #define raw_spin_lock_irq(lock) _raw_spin_lock_irq(lock) #define raw_spin_lock_bh(lock) _raw_spin_lock_bh(lock) #define raw_spin_unlock(lock) _raw_spin_unlock(lock) #define raw_spin_unlock_irq(lock) _raw_spin_unlock_irq(lock) #define raw_spin_unlock_irqrestore(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_unlock_irqrestore(lock, flags); \ } while (0) #define raw_spin_unlock_bh(lock) _raw_spin_unlock_bh(lock) #define raw_spin_trylock_bh(lock) \ __cond_lock(lock, _raw_spin_trylock_bh(lock)) #define raw_spin_trylock_irq(lock) \ ({ \ local_irq_disable(); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_enable(); 0; }); \ }) #define raw_spin_trylock_irqsave(lock, flags) \ ({ \ local_irq_save(flags); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_restore(flags); 0; }); \ }) #ifndef CONFIG_PREEMPT_RT /* Include rwlock functions for !RT */ #include <linux/rwlock.h> #endif /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) # include <linux/spinlock_api_smp.h> #else # include <linux/spinlock_api_up.h> #endif /* Non PREEMPT_RT kernel, map to raw spinlocks: */ #ifndef CONFIG_PREEMPT_RT /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) { return &lock->rlock; } #ifdef CONFIG_DEBUG_SPINLOCK # define spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init(spinlock_check(lock), \ #lock, &__key, LD_WAIT_CONFIG); \ } while (0) #else # define spin_lock_init(_lock) \ do { \ spinlock_check(_lock); \ *(_lock) = __SPIN_LOCK_UNLOCKED(_lock); \ } while (0) #endif static __always_inline void spin_lock(spinlock_t *lock) { raw_spin_lock(&lock->rlock); } static __always_inline void spin_lock_bh(spinlock_t *lock) { raw_spin_lock_bh(&lock->rlock); } static __always_inline int spin_trylock(spinlock_t *lock) { return raw_spin_trylock(&lock->rlock); } #define spin_lock_nested(lock, subclass) \ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ } while (0) static __always_inline void spin_lock_irq(spinlock_t *lock) { raw_spin_lock_irq(&lock->rlock); } #define spin_lock_irqsave(lock, flags) \ do { \ raw_spin_lock_irqsave(spinlock_check(lock), flags); \ } while (0) #define spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ } while (0) static __always_inline void spin_unlock(spinlock_t *lock) { raw_spin_unlock(&lock->rlock); } static __always_inline void spin_unlock_bh(spinlock_t *lock) { raw_spin_unlock_bh(&lock->rlock); } static __always_inline void spin_unlock_irq(spinlock_t *lock) { raw_spin_unlock_irq(&lock->rlock); } static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { raw_spin_unlock_irqrestore(&lock->rlock, flags); } static __always_inline int spin_trylock_bh(spinlock_t *lock) { return raw_spin_trylock_bh(&lock->rlock); } static __always_inline int spin_trylock_irq(spinlock_t *lock) { return raw_spin_trylock_irq(&lock->rlock); } #define spin_trylock_irqsave(lock, flags) \ ({ \ raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) /** * spin_is_locked() - Check whether a spinlock is locked. * @lock: Pointer to the spinlock. * * This function is NOT required to provide any memory ordering * guarantees; it could be used for debugging purposes or, when * additional synchronization is needed, accompanied with other * constructs (memory barriers) enforcing the synchronization. * * Returns: 1 if @lock is locked, 0 otherwise. * * Note that the function only tells you that the spinlock is * seen to be locked, not that it is locked on your CPU. * * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n, * the return value is always 0 (see include/linux/spinlock_up.h). * Therefore you should not rely heavily on the return value. */ static __always_inline int spin_is_locked(spinlock_t *lock) { return raw_spin_is_locked(&lock->rlock); } static __always_inline int spin_is_contended(spinlock_t *lock) { return raw_spin_is_contended(&lock->rlock); } #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) #else /* !CONFIG_PREEMPT_RT */ # include <linux/spinlock_rt.h> #endif /* CONFIG_PREEMPT_RT */ /* * Pull the atomic_t declaration: * (asm-mips/atomic.h needs above definitions) */ #include <linux/atomic.h> /** * atomic_dec_and_lock - lock on reaching reference count zero * @atomic: the atomic counter * @lock: the spinlock in question * * Decrements @atomic by 1. If the result is 0, returns true and locks * @lock. Returns false for all other cases. */ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #define atomic_dec_and_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags); #define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp, const char *name, struct lock_class_key *key); #define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp) \ ({ \ static struct lock_class_key key; \ int ret; \ \ ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size, \ cpu_mult, gfp, #locks, &key); \ ret; \ }) void free_bucket_spinlocks(spinlock_t *locks); #endif /* __LINUX_SPINLOCK_H */
109 233 667 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 /* SPDX-License-Identifier: GPL-2.0 */ /* * Connection state tracking for netfilter. This is separated from, * but required by, the (future) NAT layer; it can also be used by an iptables * extension. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack.h */ #ifndef _NF_CONNTRACK_H #define _NF_CONNTRACK_H #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/nf_conntrack_dccp.h> #include <linux/netfilter/nf_conntrack_sctp.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <net/netfilter/nf_conntrack_tuple.h> struct nf_ct_udp { unsigned long stream_ts; }; /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ struct nf_ct_dccp dccp; struct ip_ct_sctp sctp; struct ip_ct_tcp tcp; struct nf_ct_udp udp; struct nf_ct_gre gre; unsigned int tmpl_padto; }; union nf_conntrack_expect_proto { /* insert expect proto private data here */ }; struct nf_conntrack_net { /* only used when new connection is allocated: */ atomic_t count; unsigned int expect_count; u8 sysctl_auto_assign_helper; bool auto_assign_helper_warned; /* only used from work queues, configuration plane, and so on: */ unsigned int users4; unsigned int users6; unsigned int users_bridge; #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS struct delayed_work ecache_dwork; struct netns_ct *ct_net; #endif }; #include <linux/types.h> #include <linux/skbuff.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> struct nf_conn { /* Usage count in here is 1 for hash table, 1 per skb, * plus 1 for any connection(s) we are `master' for * * Hint, SKB address this struct and refcnt via skb->_nfct and * helpers nf_conntrack_get() and nf_conntrack_put(). * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt, * except that the latter uses internal indirection and does not * result in a conntrack module dependency. * beware nf_ct_get() is different and don't inc refcnt. */ struct nf_conntrack ct_general; spinlock_t lock; /* jiffies32 when this ct is considered dead */ u32 timeout; #ifdef CONFIG_NF_CONNTRACK_ZONES struct nf_conntrack_zone zone; #endif /* XXX should I move this to the tail ? - Y.K */ /* These are my tuples; original and reply */ struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; /* Have we seen traffic both ways yet? (bitset) */ unsigned long status; u16 cpu; possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) struct hlist_node nat_bysource; #endif /* all members below initialized via memset */ struct { } __nfct_init_offset; /* If we were expected by an expectation, this will be it */ struct nf_conn *master; #if defined(CONFIG_NF_CONNTRACK_MARK) u_int32_t mark; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK u_int32_t secmark; #endif /* Extensions */ struct nf_ct_ext *ext; /* Storage reserved for other modules, must be the last member */ union nf_conntrack_proto proto; }; static inline struct nf_conn * nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash) { return container_of(hash, struct nf_conn, tuplehash[hash->tuple.dst.dir]); } static inline u_int16_t nf_ct_l3num(const struct nf_conn *ct) { return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; } static inline u_int8_t nf_ct_protonum(const struct nf_conn *ct) { return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; } #define nf_ct_tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) /* get master conntrack via master expectation */ #define master_ct(conntr) (conntr->master) extern struct net init_net; static inline struct net *nf_ct_net(const struct nf_conn *ct) { return read_pnet(&ct->ct_net); } /* Alter reply tuple (maybe alter helper). */ void nf_conntrack_alter_reply(struct nf_conn *ct, const struct nf_conntrack_tuple *newreply); /* Is this tuple taken? (ignoring any belonging to the given conntrack). */ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack); /* Return conntrack_info and tuple hash for given skb. */ static inline struct nf_conn * nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) { unsigned long nfct = skb_get_nfct(skb); *ctinfo = nfct & NFCT_INFOMASK; return (struct nf_conn *)(nfct & NFCT_PTRMASK); } void nf_ct_destroy(struct nf_conntrack *nfct); /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { if (ct && refcount_dec_and_test(&ct->ct_general.use)) nf_ct_destroy(&ct->ct_general); } /* Protocol module loading */ int nf_ct_l3proto_try_module_get(unsigned short l3proto); void nf_ct_l3proto_module_put(unsigned short l3proto); /* load module; enable/disable conntrack in this namespace */ int nf_ct_netns_get(struct net *net, u8 nfproto); void nf_ct_netns_put(struct net *net, u8 nfproto); /* * Allocate a hashtable of hlist_head (if nulls == 0), * or hlist_nulls_head (if nulls == 1) */ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls); int nf_conntrack_hash_check_insert(struct nf_conn *ct); bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple); void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, u32 extra_jiffies, bool do_acct); /* Refresh conntrack for this many jiffies and do accounting */ static inline void nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, u32 extra_jiffies) { __nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, true); } /* Refresh conntrack for this many jiffies */ static inline void nf_ct_refresh(struct nf_conn *ct, const struct sk_buff *skb, u32 extra_jiffies) { __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, false); } /* kill conntrack and do accounting */ bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb); /* kill conntrack without accounting */ static inline bool nf_ct_kill(struct nf_conn *ct) { return nf_ct_delete(ct, 0, 0); } /* Set all unconfirmed conntrack as dying */ void nf_ct_unconfirmed_destroy(struct net *); /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report); /* also set unconfirmed conntracks as dying. Only use in module exit path. */ void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data); struct nf_conntrack_zone; void nf_conntrack_free(struct nf_conn *ct); struct nf_conn *nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp); static inline int nf_ct_is_template(const struct nf_conn *ct) { return test_bit(IPS_TEMPLATE_BIT, &ct->status); } /* It's confirmed if it is, or has been in the hash table. */ static inline int nf_ct_is_confirmed(const struct nf_conn *ct) { return test_bit(IPS_CONFIRMED_BIT, &ct->status); } static inline int nf_ct_is_dying(const struct nf_conn *ct) { return test_bit(IPS_DYING_BIT, &ct->status); } /* Packet is received from loopback */ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) { return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } #define nfct_time_stamp ((u32)(jiffies)) /* jiffies until ct expires, 0 if already expired */ static inline unsigned long nf_ct_expires(const struct nf_conn *ct) { s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; return timeout > 0 ? timeout : 0; } static inline bool nf_ct_is_expired(const struct nf_conn *ct) { return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0; } /* use after obtaining a reference count */ static inline bool nf_ct_should_gc(const struct nf_conn *ct) { return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct); } #define NF_CT_DAY (86400 * HZ) /* Set an arbitrary timeout large enough not to ever expire, this save * us a check for the IPS_OFFLOAD_BIT from the packet path via * nf_ct_is_expired(). */ static inline void nf_ct_offload_timeout(struct nf_conn *ct) { if (nf_ct_expires(ct) < NF_CT_DAY / 2) WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY); } struct kernel_param; int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp); int nf_conntrack_hash_resize(unsigned int hashsize); extern struct hlist_nulls_head *nf_conntrack_hash; extern unsigned int nf_conntrack_htable_size; extern seqcount_spinlock_t nf_conntrack_generation; extern unsigned int nf_conntrack_max; /* must be called with rcu read lock held */ static inline void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) { struct hlist_nulls_head *hptr; unsigned int sequence, hsz; do { sequence = read_seqcount_begin(&nf_conntrack_generation); hsz = nf_conntrack_htable_size; hptr = nf_conntrack_hash; } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); *hash = hptr; *hsize = hsz; } struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags); void nf_ct_tmpl_free(struct nf_conn *tmpl); u32 nf_ct_get_id(const struct nf_conn *ct); u32 nf_conntrack_count(const struct net *net); static inline void nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) { skb_set_nfct(skb, (unsigned long)ct | info); } extern unsigned int nf_conntrack_net_id; static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net) { return net_generic(net, nf_conntrack_net_id); } #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v)) #define MODULE_ALIAS_NFCT_HELPER(helper) \ MODULE_ALIAS("nfct-helper-" helper) #endif /* _NF_CONNTRACK_H */
1083 1046 57 24 10 10 9 10 1 1 30 62 25 33 8 58 43 12 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic parts * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/init.h> #include <linux/llc.h> #include <net/llc.h> #include <net/stp.h> #include <net/switchdev.h> #include "br_private.h" /* * Handle changes in state of network devices enslaved to a bridge. * * Note: don't care about up/down if bridge itself is down, because * port state is checked when bridge is brought up. */ static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; bool notified = false; bool changed_addr; int err; if (dev->priv_flags & IFF_EBRIDGE) { err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); if (event == NETDEV_REGISTER) { /* register of bridge completed, add sysfs entries */ err = br_sysfs_addbr(dev); if (err) return notifier_from_errno(err); return NOTIFY_DONE; } } /* not a port of a bridge */ p = br_port_get_rtnl(dev); if (!p) return NOTIFY_DONE; br = p->br; switch (event) { case NETDEV_CHANGEMTU: br_mtu_auto_adjust(br); break; case NETDEV_PRE_CHANGEADDR: if (br->dev->addr_assign_type == NET_ADDR_SET) break; prechaddr_info = ptr; err = dev_pre_changeaddr_notify(br->dev, prechaddr_info->dev_addr, extack); if (err) return notifier_from_errno(err); break; case NETDEV_CHANGEADDR: spin_lock_bh(&br->lock); br_fdb_changeaddr(p, dev->dev_addr); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); break; case NETDEV_CHANGE: br_port_carrier_check(p, &notified); break; case NETDEV_FEAT_CHANGE: netdev_update_features(br->dev); break; case NETDEV_DOWN: spin_lock_bh(&br->lock); if (br->dev->flags & IFF_UP) { br_stp_disable_port(p); notified = true; } spin_unlock_bh(&br->lock); break; case NETDEV_UP: if (netif_running(br->dev) && netif_oper_up(dev)) { spin_lock_bh(&br->lock); br_stp_enable_port(p); notified = true; spin_unlock_bh(&br->lock); } break; case NETDEV_UNREGISTER: br_del_if(br, dev); break; case NETDEV_CHANGENAME: err = br_sysfs_renameif(p); if (err) return notifier_from_errno(err); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, br->dev); break; } if (event != NETDEV_UNREGISTER) br_vlan_port_event(p, event); /* Events that may cause spanning tree to refresh */ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP || event == NETDEV_CHANGE || event == NETDEV_DOWN)) br_ifinfo_notify(RTM_NEWLINK, NULL, p); return NOTIFY_DONE; } static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; /* called with RTNL or RCU */ static int br_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; p = br_port_get_rtnl_rcu(dev); if (!p) goto out; br = p->br; switch (event) { case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid, false); if (err) { err = notifier_from_errno(err); break; } br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, true); break; case SWITCHDEV_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid, false); if (err) err = notifier_from_errno(err); break; case SWITCHDEV_FDB_OFFLOADED: fdb_info = ptr; br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_FLUSH_TO_BRIDGE: fdb_info = ptr; /* Don't delete static entries */ br_fdb_delete_by_port(br, p, fdb_info->vid, 0); break; } out: return err; } static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; /* called under rtnl_mutex */ static int br_switchdev_blocking_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct switchdev_notifier_brport_info *brport_info; const struct switchdev_brport *b; struct net_bridge_port *p; int err = NOTIFY_DONE; p = br_port_get_rtnl(dev); if (!p) goto out; switch (event) { case SWITCHDEV_BRPORT_OFFLOADED: brport_info = ptr; b = &brport_info->brport; err = br_switchdev_port_offload(p, b->dev, b->ctx, b->atomic_nb, b->blocking_nb, b->tx_fwd_offload, extack); err = notifier_from_errno(err); break; case SWITCHDEV_BRPORT_UNOFFLOADED: brport_info = ptr; b = &brport_info->brport; br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, b->blocking_nb); break; } out: return err; } static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device * @opt: id of the option to change * @on: new option value * @extack: extack for error messages * * Changes the value of the respective boolean option to @on taking care of * any internal option value mapping and configuration. */ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack) { int err = 0; switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: br_opt_toggle(br, BROPT_NO_LL_LEARN, on); break; case BR_BOOLOPT_MCAST_VLAN_SNOOPING: err = br_multicast_toggle_vlan_snooping(br, on, extack); break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return err; } int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) { switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: return br_opt_get(br, BROPT_NO_LL_LEARN); case BR_BOOLOPT_MCAST_VLAN_SNOOPING: return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return 0; } int br_boolopt_multi_toggle(struct net_bridge *br, struct br_boolopt_multi *bm, struct netlink_ext_ack *extack) { unsigned long bitmap = bm->optmask; int err = 0; int opt_id; for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); err = br_boolopt_toggle(br, opt_id, on, extack); if (err) { br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n", opt_id, br_boolopt_get(br, opt_id), on, err); break; } } return err; } void br_boolopt_multi_get(const struct net_bridge *br, struct br_boolopt_multi *bm) { u32 optval = 0; int opt_id; for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++) optval |= (br_boolopt_get(br, opt_id) << opt_id); bm->optval = optval; bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0); } /* private bridge options, controlled by the kernel */ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) { bool cur = !!br_opt_get(br, opt); br_debug(br, "toggle option: %d state: %d -> %d\n", opt, cur, on); if (cur == on) return; if (on) set_bit(opt, &br->options); else clear_bit(opt, &br->options); } static void __net_exit br_net_exit(struct net *net) { struct net_device *dev; LIST_HEAD(list); rtnl_lock(); for_each_netdev(net, dev) if (dev->priv_flags & IFF_EBRIDGE) br_dev_delete(dev, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations br_net_ops = { .exit = br_net_exit, }; static const struct stp_proto br_stp_proto = { .rcv = br_stp_rcv, }; static int __init br_init(void) { int err; BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > sizeof_field(struct sk_buff, cb)); err = stp_proto_register(&br_stp_proto); if (err < 0) { pr_err("bridge: can't register sap for STP\n"); return err; } err = br_fdb_init(); if (err) goto err_out; err = register_pernet_subsys(&br_net_ops); if (err) goto err_out1; err = br_nf_core_init(); if (err) goto err_out2; err = register_netdevice_notifier(&br_device_notifier); if (err) goto err_out3; err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); if (err) goto err_out5; err = br_netlink_init(); if (err) goto err_out6; brioctl_set(br_ioctl_stub); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = br_fdb_test_addr; #endif #if IS_MODULE(CONFIG_BRIDGE_NETFILTER) pr_info("bridge: filtering via arp/ip/ip6tables is no longer available " "by default. Update your scripts to load br_netfilter if you " "need this.\n"); #endif return 0; err_out6: unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); err_out5: unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: br_fdb_fini(); err_out: stp_proto_unregister(&br_stp_proto); return err; } static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif br_fdb_fini(); } module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); MODULE_VERSION(BR_VERSION); MODULE_ALIAS_RTNL_LINK("bridge");
560 560 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/module.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfctrl.h> #include <net/caif/cfmuxl.h> #include <net/caif/cffrml.h> #include <net/caif/cfserl.h> #include <net/caif/cfsrvl.h> #include <net/caif/caif_dev.h> #define container_obj(layr) container_of(layr, struct cfcnfg, layer) /* Information about CAIF physical interfaces held by Config Module in order * to manage physical interfaces */ struct cfcnfg_phyinfo { struct list_head node; bool up; /* Pointer to the layer below the MUX (framing layer) */ struct cflayer *frm_layer; /* Pointer to the lowest actual physical layer */ struct cflayer *phy_layer; /* Unique identifier of the physical interface */ unsigned int id; /* Preference of the physical in interface */ enum cfcnfg_phy_preference pref; /* Information about the physical device */ struct dev_info dev_info; /* Interface index */ int ifindex; /* Protocol head room added for CAIF link layer */ int head_room; /* Use Start of frame checksum */ bool use_fcs; }; struct cfcnfg { struct cflayer layer; struct cflayer *ctrl; struct cflayer *mux; struct list_head phys; struct mutex lock; }; static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer); static void cfctrl_resp_func(void); static void cfctrl_enum_resp(void); struct cfcnfg *cfcnfg_create(void) { struct cfcnfg *this; struct cfctrl_rsp *resp; might_sleep(); /* Initiate this layer */ this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); if (!this) return NULL; this->mux = cfmuxl_create(); if (!this->mux) goto out_of_mem; this->ctrl = cfctrl_create(); if (!this->ctrl) goto out_of_mem; /* Initiate response functions */ resp = cfctrl_get_respfuncs(this->ctrl); resp->enum_rsp = cfctrl_enum_resp; resp->linkerror_ind = cfctrl_resp_func; resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp; resp->sleep_rsp = cfctrl_resp_func; resp->wake_rsp = cfctrl_resp_func; resp->restart_rsp = cfctrl_resp_func; resp->radioset_rsp = cfctrl_resp_func; resp->linksetup_rsp = cfcnfg_linkup_rsp; resp->reject_rsp = cfcnfg_reject_rsp; INIT_LIST_HEAD(&this->phys); cfmuxl_set_uplayer(this->mux, this->ctrl, 0); layer_set_dn(this->ctrl, this->mux); layer_set_up(this->ctrl, this); mutex_init(&this->lock); return this; out_of_mem: synchronize_rcu(); kfree(this->mux); kfree(this->ctrl); kfree(this); return NULL; } void cfcnfg_remove(struct cfcnfg *cfg) { might_sleep(); if (cfg) { synchronize_rcu(); kfree(cfg->mux); cfctrl_remove(cfg->ctrl); kfree(cfg); } } static void cfctrl_resp_func(void) { } static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg, u8 phyid) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->id == phyid) return phy; return NULL; } static void cfctrl_enum_resp(void) { } static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg, enum cfcnfg_phy_preference phy_pref) { /* Try to match with specified preference */ struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) { if (phy->up && phy->pref == phy_pref && phy->frm_layer != NULL) return &phy->dev_info; } /* Otherwise just return something */ list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->up) return &phy->dev_info; return NULL; } static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->ifindex == ifi && phy->up) return phy->id; return -ENODEV; } int caif_disconnect_client(struct net *net, struct cflayer *adap_layer) { u8 channel_id; struct cfcnfg *cfg = get_cfcnfg(net); caif_assert(adap_layer != NULL); cfctrl_cancel_req(cfg->ctrl, adap_layer); channel_id = adap_layer->id; if (channel_id != 0) { struct cflayer *servl; servl = cfmuxl_remove_uplayer(cfg->mux, channel_id); cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer); if (servl != NULL) layer_set_up(servl, NULL); } else pr_debug("nothing to disconnect\n"); /* Do RCU sync before initiating cleanup */ synchronize_rcu(); if (adap_layer->ctrlcmd != NULL) adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0); return 0; } EXPORT_SYMBOL(caif_disconnect_client); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id) { } static const int protohead[CFCTRL_SRV_MASK] = { [CFCTRL_SRV_VEI] = 4, [CFCTRL_SRV_DATAGRAM] = 7, [CFCTRL_SRV_UTIL] = 4, [CFCTRL_SRV_RFM] = 3, [CFCTRL_SRV_DBG] = 3, }; static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, struct caif_connect_request *s, struct cfctrl_link_param *l) { struct dev_info *dev_info; enum cfcnfg_phy_preference pref; int res; memset(l, 0, sizeof(*l)); /* In caif protocol low value is high priority */ l->priority = CAIF_PRIO_MAX - s->priority + 1; if (s->ifindex != 0) { res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex); if (res < 0) return res; l->phyid = res; } else { switch (s->link_selector) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: return -EINVAL; } dev_info = cfcnfg_get_phyid(cnfg, pref); if (dev_info == NULL) return -ENODEV; l->phyid = dev_info->id; } switch (s->protocol) { case CAIFPROTO_AT: l->linktype = CFCTRL_SRV_VEI; l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3; l->chtype = s->sockaddr.u.at.type & 0x3; break; case CAIFPROTO_DATAGRAM: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_DATAGRAM_LOOP: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x03; l->endpoint = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; strlcpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; strlcpy(l->u.utility.name, s->sockaddr.u.util.service, sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) l->u.utility.paramlen = sizeof(l->u.utility.params); memcpy(l->u.utility.params, s->param.data, l->u.utility.paramlen); break; case CAIFPROTO_DEBUG: l->linktype = CFCTRL_SRV_DBG; l->endpoint = s->sockaddr.u.dbg.service; l->chtype = s->sockaddr.u.dbg.type; break; default: return -EINVAL; } return 0; } int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, struct cflayer *adap_layer, int *ifindex, int *proto_head, int *proto_tail) { struct cflayer *frml; struct cfcnfg_phyinfo *phy; int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, &param); if (err) goto unlock; phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid); if (!phy) { err = -ENODEV; goto unlock; } err = -EINVAL; if (adap_layer == NULL) { pr_err("adap_layer is zero\n"); goto unlock; } if (adap_layer->receive == NULL) { pr_err("adap_layer->receive is NULL\n"); goto unlock; } if (adap_layer->ctrlcmd == NULL) { pr_err("adap_layer->ctrlcmd == NULL\n"); goto unlock; } err = -ENODEV; frml = phy->frm_layer; if (frml == NULL) { pr_err("Specified PHY type does not exist!\n"); goto unlock; } caif_assert(param.phyid == phy->id); caif_assert(phy->frm_layer->id == param.phyid); caif_assert(phy->phy_layer->id == param.phyid); *ifindex = phy->ifindex; *proto_tail = 2; *proto_head = protohead[param.linktype] + phy->head_room; rcu_read_unlock(); /* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */ cfctrl_enum_req(cfg->ctrl, param.phyid); return cfctrl_linkup_request(cfg->ctrl, &param, adap_layer); unlock: rcu_read_unlock(); return err; } EXPORT_SYMBOL(caif_connect_client); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer) { if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); } static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer) { struct cfcnfg *cnfg = container_obj(layer); struct cflayer *servicel = NULL; struct cfcnfg_phyinfo *phyinfo; struct net_device *netdev; if (channel_id == 0) { pr_warn("received channel_id zero\n"); if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); return; } rcu_read_lock(); if (adapt_layer == NULL) { pr_debug("link setup response but no client exist, send linkdown back\n"); cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); goto unlock; } caif_assert(cnfg != NULL); caif_assert(phyid != 0); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { pr_err("ERROR: Link Layer Device disappeared while connecting\n"); goto unlock; } caif_assert(phyinfo != NULL); caif_assert(phyinfo->id == phyid); caif_assert(phyinfo->phy_layer != NULL); caif_assert(phyinfo->phy_layer->id == phyid); adapt_layer->id = channel_id; switch (serv) { case CFCTRL_SRV_VEI: servicel = cfvei_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DATAGRAM: servicel = cfdgml_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_RFM: netdev = phyinfo->dev_info.dev; servicel = cfrfml_create(channel_id, &phyinfo->dev_info, netdev->mtu); break; case CFCTRL_SRV_UTIL: servicel = cfutill_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_VIDEO: servicel = cfvidl_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DBG: servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); break; default: pr_err("Protocol error. Link setup response - unknown channel type\n"); goto unlock; } if (!servicel) goto unlock; layer_set_dn(servicel, cnfg->mux); cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id); layer_set_up(servicel, adapt_layer); layer_set_dn(adapt_layer, servicel); rcu_read_unlock(); servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0); return; unlock: rcu_read_unlock(); } int cfcnfg_add_phy_layer(struct cfcnfg *cnfg, struct net_device *dev, struct cflayer *phy_layer, enum cfcnfg_phy_preference pref, struct cflayer *link_support, bool fcs, int head_room) { struct cflayer *frml; struct cfcnfg_phyinfo *phyinfo = NULL; int i, res = 0; u8 phyid; mutex_lock(&cnfg->lock); /* CAIF protocol allow maximum 6 link-layers */ for (i = 0; i < 7; i++) { phyid = (dev->ifindex + i) & 0x7; if (phyid == 0) continue; if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL) goto got_phyid; } pr_warn("Too many CAIF Link Layers (max 6)\n"); res = -EEXIST; goto out; got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; goto out; } phy_layer->id = phyid; phyinfo->pref = pref; phyinfo->id = phyid; phyinfo->dev_info.id = phyid; phyinfo->dev_info.dev = dev; phyinfo->phy_layer = phy_layer; phyinfo->ifindex = dev->ifindex; phyinfo->head_room = head_room; phyinfo->use_fcs = fcs; frml = cffrml_create(phyid, fcs); if (!frml) { res = -ENOMEM; goto out_err; } phyinfo->frm_layer = frml; layer_set_up(frml, cnfg->mux); if (link_support != NULL) { link_support->id = phyid; layer_set_dn(frml, link_support); layer_set_up(link_support, frml); layer_set_dn(link_support, phy_layer); layer_set_up(phy_layer, link_support); } else { layer_set_dn(frml, phy_layer); layer_set_up(phy_layer, frml); } list_add_rcu(&phyinfo->node, &cnfg->phys); out: mutex_unlock(&cnfg->lock); return res; out_err: kfree(phyinfo); mutex_unlock(&cnfg->lock); return res; } EXPORT_SYMBOL(cfcnfg_add_phy_layer); int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer, bool up) { struct cfcnfg_phyinfo *phyinfo; rcu_read_lock(); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id); if (phyinfo == NULL) { rcu_read_unlock(); return -ENODEV; } if (phyinfo->up == up) { rcu_read_unlock(); return 0; } phyinfo->up = up; if (up) { cffrml_hold(phyinfo->frm_layer); cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer, phy_layer->id); } else { cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id); cffrml_put(phyinfo->frm_layer); } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(cfcnfg_set_phy_state); int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer) { struct cflayer *frml, *frml_dn; u16 phyid; struct cfcnfg_phyinfo *phyinfo; might_sleep(); mutex_lock(&cnfg->lock); phyid = phy_layer->id; phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { mutex_unlock(&cnfg->lock); return 0; } caif_assert(phyid == phyinfo->id); caif_assert(phy_layer == phyinfo->phy_layer); caif_assert(phy_layer->id == phyid); caif_assert(phyinfo->frm_layer->id == phyid); list_del_rcu(&phyinfo->node); synchronize_rcu(); /* Fail if reference count is not zero */ if (cffrml_refcnt_read(phyinfo->frm_layer) != 0) { pr_info("Wait for device inuse\n"); list_add_rcu(&phyinfo->node, &cnfg->phys); mutex_unlock(&cnfg->lock); return -EAGAIN; } frml = phyinfo->frm_layer; frml_dn = frml->dn; cffrml_set_uplayer(frml, NULL); cffrml_set_dnlayer(frml, NULL); if (phy_layer != frml_dn) { layer_set_up(frml_dn, NULL); layer_set_dn(frml_dn, NULL); } layer_set_up(phy_layer, NULL); if (phyinfo->phy_layer != frml_dn) kfree(frml_dn); cffrml_free(frml); kfree(phyinfo); mutex_unlock(&cnfg->lock); return 0; } EXPORT_SYMBOL(cfcnfg_del_phy_layer);
2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/root.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc root directory handling functions */ #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/fs_parser.h> #include <linux/cred.h> #include <linux/magic.h> #include <linux/slab.h> #include "internal.h" struct proc_fs_context { struct pid_namespace *pid_ns; unsigned int mask; enum proc_hidepid hidepid; int gid; enum proc_pidonly pidonly; }; enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, }; static const struct fs_parameter_spec proc_fs_parameters[] = { fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), {} }; static inline int valid_hidepid(unsigned int value) { return (value == HIDEPID_OFF || value == HIDEPID_NO_ACCESS || value == HIDEPID_INVISIBLE || value == HIDEPID_NOT_PTRACEABLE); } static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid); struct fs_parse_result result; int base = (unsigned long)hidepid_u32_spec.data; if (param->type != fs_value_is_string) return invalf(fc, "proc: unexpected type of hidepid value\n"); if (!kstrtouint(param->string, base, &result.uint_32)) { if (!valid_hidepid(result.uint_32)) return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); ctx->hidepid = result.uint_32; return 0; } if (!strcmp(param->string, "off")) ctx->hidepid = HIDEPID_OFF; else if (!strcmp(param->string, "noaccess")) ctx->hidepid = HIDEPID_NO_ACCESS; else if (!strcmp(param->string, "invisible")) ctx->hidepid = HIDEPID_INVISIBLE; else if (!strcmp(param->string, "ptraceable")) ctx->hidepid = HIDEPID_NOT_PTRACEABLE; else return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); return 0; } static int proc_parse_subset_param(struct fs_context *fc, char *value) { struct proc_fs_context *ctx = fc->fs_private; while (value) { char *ptr = strchr(value, ','); if (ptr != NULL) *ptr++ = '\0'; if (*value != '\0') { if (!strcmp(value, "pid")) { ctx->pidonly = PROC_PIDONLY_ON; } else { return invalf(fc, "proc: unsupported subset option - %s\n", value); } } value = ptr; } return 0; } static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_gid: ctx->gid = result.uint_32; break; case Opt_hidepid: if (proc_parse_hidepid_param(fc, param)) return -EINVAL; break; case Opt_subset: if (proc_parse_subset_param(fc, param->string) < 0) return -EINVAL; break; default: return -EINVAL; } ctx->mask |= 1 << opt; return 0; } static void proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; struct inode *root_inode; struct proc_fs_info *fs_info; int ret; fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL); if (!fs_info) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; s->s_fs_info = fs_info; /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on * top of it */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ s->s_shrink.seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { pr_err("proc_fill_super: get root inode failed\n"); return -ENOMEM; } s->s_root = d_make_root(root_inode); if (!s->s_root) { pr_err("proc_fill_super: allocate dentry failed\n"); return -ENOMEM; } ret = proc_setup_self(s); if (ret) { return ret; } return proc_setup_thread_self(s); } static int proc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct proc_fs_info *fs_info = proc_sb_info(sb); sync_filesystem(sb); proc_apply_options(fs_info, fc, current_user_ns()); return 0; } static int proc_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, proc_fill_super); } static void proc_fs_context_free(struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; put_pid_ns(ctx->pid_ns); kfree(ctx); } static const struct fs_context_operations proc_fs_context_ops = { .free = proc_fs_context_free, .parse_param = proc_parse_param, .get_tree = proc_get_tree, .reconfigure = proc_reconfigure, }; static int proc_init_fs_context(struct fs_context *fc) { struct proc_fs_context *ctx; ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); fc->fs_private = ctx; fc->ops = &proc_fs_context_ops; return 0; } static void proc_kill_sb(struct super_block *sb) { struct proc_fs_info *fs_info = proc_sb_info(sb); if (!fs_info) { kill_anon_super(sb); return; } dput(fs_info->proc_self); dput(fs_info->proc_thread_self); kill_anon_super(sb); put_pid_ns(fs_info->pid_ns); kfree(fs_info); } static struct file_system_type proc_fs_type = { .name = "proc", .init_fs_context = proc_init_fs_context, .parameters = proc_fs_parameters, .kill_sb = proc_kill_sb, .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) { proc_init_kmemcache(); set_proc_pid_nlink(); proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ proc_create_mount_point("openprom"); #endif proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); register_filesystem(&proc_fs_type); } static int proc_root_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { if (!proc_pid_lookup(dentry, flags)) return NULL; return proc_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file *file, struct dir_context *ctx) { if (ctx->pos < FIRST_PROCESS_ENTRY) { int error = proc_readdir(file, ctx); if (unlikely(error <= 0)) return error; ctx->pos = FIRST_PROCESS_ENTRY; } return proc_pid_readdir(file, ctx); } /* * The root /proc directory is special, as it has the * <pid> directories. Thus we don't use the generic * directory handling functions for that.. */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, .iterate_shared = proc_root_readdir, .llseek = generic_file_llseek, }; /* * proc root can do almost nothing.. */ static const struct inode_operations proc_root_inode_operations = { .lookup = proc_root_lookup, .getattr = proc_root_getattr, }; /* * This is the root "inode" in the /proc tree.. */ struct proc_dir_entry proc_root = { .low_ino = PROC_ROOT_INO, .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, .name = "/proc", };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NFNETLINK_H #define _NFNETLINK_H #include <linux/netlink.h> #include <linux/capability.h> #include <net/netlink.h> #include <uapi/linux/netfilter/nfnetlink.h> struct nfnl_info { struct net *net; struct sock *sk; const struct nlmsghdr *nlh; const struct nfgenmsg *nfmsg; struct netlink_ext_ack *extack; }; enum nfnl_callback_type { NFNL_CB_UNSPEC = 0, NFNL_CB_MUTEX, NFNL_CB_RCU, NFNL_CB_BATCH, }; struct nfnl_callback { int (*call)(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]); const struct nla_policy *policy; enum nfnl_callback_type type; __u16 attr_count; }; enum nfnl_abort_action { NFNL_ABORT_NONE = 0, NFNL_ABORT_AUTOLOAD, NFNL_ABORT_VALIDATE, }; struct nfnetlink_subsystem { const char *name; __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ const struct nfnl_callback *cb; /* callback for individual types */ struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action); void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); int nfnetlink_has_listeners(struct net *net, unsigned int group); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid); void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { return subsys << 8 | msg_type; } static inline void nfnl_fill_hdr(struct nlmsghdr *nlh, u8 family, u8 version, __be16 res_id) { struct nfgenmsg *nfmsg; nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = version; nfmsg->res_id = res_id; } static inline struct nlmsghdr *nfnl_msg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int flags, u8 family, u8 version, __be16 res_id) { struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); if (!nlh) return NULL; nfnl_fill_hdr(nlh, family, version, res_id); return nlh; } void nfnl_lock(__u8 subsys_id); void nfnl_unlock(__u8 subsys_id); #ifdef CONFIG_PROVE_LOCKING bool lockdep_nfnl_is_held(__u8 subsys_id); #else static inline bool lockdep_nfnl_is_held(__u8 subsys_id) { return true; } #endif /* CONFIG_PROVE_LOCKING */ #define MODULE_ALIAS_NFNL_SUBSYS(subsys) \ MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys)) #endif /* _NFNETLINK_H */
443 443 442 2953 2955 2956 1388 669 2957 19 443 443 443 443 442 443 443 443 442 23 23 23 23 25 3 23 25 26 2953 2954 1 2954 1 2953 1548 2956 2939 925 924 23 917 634 2624 2625 327 327 10 1 325 11 319 311 7 4681 4786 48 124 10 10 279 4662 3958 154 1012 340 665 533 2 2 1 34 1 34 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <net/sock.h> #include "internal.h" unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int count) { unsigned int cpy, set; cpy = count / BITS_PER_BYTE; set = (nfdt->max_fds - count) / BITS_PER_BYTE; memcpy(nfdt->open_fds, ofdt->open_fds, cpy); memset((char *)nfdt->open_fds + cpy, 0, set); memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); memset((char *)nfdt->close_on_exec + cpy, 0, set); cpy = BITBIT_SIZE(count); set = BITBIT_SIZE(nfdt->max_fds) - cpy; memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); memset((char *)nfdt->full_fds_bits + cpy, 0, set); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". * * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied * by that "1024/sizeof(ptr)" before, we already know there are sufficient * clear low bits. Clang seems to realize that, gcc ends up being confused. * * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, * let's consider it documentation (and maybe a test-case for gcc to improve * its code generation ;) */ static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. */ nr /= (1024 / sizeof(struct file *)); nr = roundup_pow_of_two(nr + 1); nr *= (1024 / sizeof(struct file *)); nr = ALIGN(nr, BITS_PER_LONG); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. Deal * with that in caller, it's cheaper that way. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return NULL; } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; /* * extremely unlikely race - sysctl_nr_open decreased between the check in * caller and alloc_fdtable(). Cheaper to catch it here... */ if (unlikely(new_fdt->max_fds <= nr)) { __free_fdtable(new_fdt); return -EMFILE; } cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 1; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 when nothing done; 1 when files were * expanded and execution may have blocked. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int expanded = 0; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* All good, so we try */ files->resize_in_progress = true; expanded = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return expanded; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->close_on_exec); } static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } static unsigned int count_open_files(struct fdtable *fdt) { unsigned int size = fdt->max_fds; unsigned int i; /* Find the last open fd */ for (i = size / BITS_PER_LONG; i > 0; ) { if (fdt->open_fds[--i]) break; } i = (i + 1) * BITS_PER_LONG; return i; } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * 'max_fds' will normally already be properly aligned, but it * turns out that in the close_range() -> __close_range() -> * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end * up having a 'max_fds' value that isn't already aligned. * * Rather than make close_range() have to worry about this, * just make that BITS_PER_LONG alignment be part of a sane * fdtable size. Becuase that's really what it is. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) { unsigned int count; count = count_open_files(fdt); if (max_fds < NR_OPEN_DEFAULT) max_fds = NR_OPEN_DEFAULT; return ALIGN(min(count, max_fds), BITS_PER_LONG); } /* * Allocate a new files structure and copy contents from the * passed in files structure. * errorp will be valid only when the returned files_struct is NULL. */ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) goto out; atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, max_fds); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files - 1); if (!new_fdt) { *errorp = -ENOMEM; goto out_release; } /* beyond sysctl_nr_open; nothing to do */ if (unlikely(new_fdt->max_fds < open_files)) { __free_fdtable(new_fdt); *errorp = -EMFILE; goto out_release; } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, max_fds); } copy_fd_bitmaps(new_fdt, old_fdt, open_files); old_fds = old_fdt->fd; new_fds = new_fdt->fd; for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) { get_file(f); } else { /* * The fd may be claimed in the fd bitmap but not yet * instantiated in the files array if a sibling thread * is partway through open(). So make sure that this * fd is available to the new process. */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; out_release: kmem_cache_free(files_cachep, newf); out: return NULL; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit > maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (fd < fdt->max_fds) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (fd >= end) goto out; error = expand_files(files, fd); if (error < 0) goto out; /* * If we needed to expand the fs array we * might have blocked - try again. */ if (error) goto repeat; if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); error = fd; #if 1 /* Sanity check */ if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } #endif out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /* * Install a file pointer in the fd array. * * The VFS is full of places where we drop the files lock between * setting the open_fds bitmap and installing the file in the file * array. At any such point, we are vulnerable to a dup2() race * installing a file in the array before us. We need to detect this and * fput() the struct file we are about to overwrite in this case. * * It should never happen - if we allow dup2() do it, _really_ bad things * will follow. * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * pick_file - return file associatd with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * If this functions returns an EINVAL error pointer the fd was beyond the * current maximum number of file descriptors for that fdtable. * * Returns: The file associated with @fd, on error returns an error pointer. */ static struct file *pick_file(struct files_struct *files, unsigned fd) { struct file *file; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (fd >= fdt->max_fds) { file = ERR_PTR(-EINVAL); goto out_unlock; } fd = array_index_nospec(fd, fdt->max_fds); file = fdt->fd[fd]; if (!file) { file = ERR_PTR(-EBADF); goto out_unlock; } rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); out_unlock: spin_unlock(&files->file_lock); return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; file = pick_file(files, fd); if (IS_ERR(file)) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /* for ksys_close() */ /** * last_fd - return last valid index into fd table * @cur_fds: files struct * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { while (fd <= max_fd) { struct file *file; file = pick_file(cur_fds, fd++); if (!IS_ERR(file)) { /* found a valid file to close */ filp_close(file, cur_fds); cond_resched(); continue; } /* beyond the last fd in that table */ if (PTR_ERR(file) == -EINVAL) return; } } /** * __close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. */ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if (flags & CLOSE_RANGE_UNSHARE) { int ret; unsigned int max_unshare_fds = NR_OPEN_MAX; /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (!(flags & CLOSE_RANGE_CLOEXEC)) { /* * If the requested range is greater than the current * maximum, we're closing everything so only copy all * file descriptors beneath the lowest file descriptor. */ rcu_read_lock(); if (max_fd >= last_fd(files_fdtable(cur_fds))) max_unshare_fds = fd; rcu_read_unlock(); } ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); if (ret) return ret; /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ if (fds) swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /* * See close_fd_get_file() below, this variant assumes current->files->file_lock * is held. */ int __close_fd_get_file(unsigned int fd, struct file **res) { struct files_struct *files = current->files; struct file *file; struct fdtable *fdt; fdt = files_fdtable(files); if (fd >= fdt->max_fds) goto out_err; file = fdt->fd[fd]; if (!file) goto out_err; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); get_file(file); *res = file; return 0; out_err: *res = NULL; return -ENOENT; } /* * variant of close_fd that gets a ref on the file for later fput. * The caller must ensure that filp_close() called on the file, and then * an fput(). */ int close_fd_get_file(unsigned int fd, struct file **res) { struct files_struct *files = current->files; int ret; spin_lock(&files->file_lock); ret = __close_fd_get_file(fd, res); spin_unlock(&files->file_lock); return ret; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask, unsigned int refs) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; if (unlikely(fd >= fdt->max_fds)) return NULL; fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); file = rcu_dereference_raw(*fdentry); if (unlikely(!file)) return NULL; if (unlikely(file->f_mode & mask)) return NULL; /* * Ok, we have a file pointer. However, because we do * this all locklessly under RCU, we may be racing with * that file being closed. * * Such a race can take two forms: * * (a) the file ref already went down to zero, * and get_file_rcu_many() fails. Just try * again: */ if (unlikely(!get_file_rcu_many(file, refs))) continue; /* * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our refs and try again. */ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || unlikely(rcu_dereference_raw(*fdentry) != file)) { fput_many(file, refs); continue; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask, unsigned int refs) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask, refs); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) { return __fget_files(current->files, fd, mask, refs); } struct file *fget_many(unsigned int fd, unsigned int refs) { return __fget(fd, FMODE_PATH, refs); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH, 1); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0, 1); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0, 1); task_unlock(task); return file; } struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; struct file *file = NULL; task_lock(task); files = task->files; if (files) file = files_lookup_fd_rcu(files, fd); task_unlock(task); return file; } struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { for (; fd < files_fdtable(files)->max_fds; fd++) { file = files_lookup_fd_rcu(files, fd); if (file) break; } } task_unlock(task); *ret_fd = fd; return file; } /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. */ static unsigned long __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (atomic_read_acquire(&files->count) == 1) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file; } else { file = __fget(fd, mask, 1); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; } } unsigned long __fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(__fdget); unsigned long __fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } unsigned long __fdget_pos(unsigned int fd) { unsigned long v = __fdget(fd); struct file *file = (struct file *)(v & ~3); if (file && (file->f_mode & FMODE_ATOMIC_POS)) { if (file_count(file) > 1) { v |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } } return v; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (flag) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { struct files_struct *files = current->files; struct fdtable *fdt; bool res; rcu_read_lock(); fdt = files_fdtable(files); res = close_on_exec(fd, fdt); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * We need to detect attempts to do dup2() over allocated but still * not finished descriptor. NB: OpenBSD avoids that at the price of * extra work in their equivalent of fget() - they insert struct * file immediately after grabbing descriptor, mark it larval if * more work (e.g. actual opening) is needed and make sure that * fget() treats larval files as absent. Potentially interesting, * but while extra work in fget() is trivial, locking implications * and amount of surgery on open()-related paths in VFS are not. * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" * deadlocks in rather amusing ways, AFAICS. All of that is out of * scope of POSIX or SUS, since neither considers shared descriptor * tables and this condition does not arise without those. */ fdt = files_fdtable(files); tofree = fdt->fd[fd]; if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; return do_dup2(files, file, fd, flags); out_unlock: spin_unlock(&files->file_lock); return err; } /** * __receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; error = security_file_receive(file); if (error) return error; new_fd = get_unused_fd_flags(o_flags); if (new_fd < 0) return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { put_unused_fd(new_fd); return error; } } fd_install(new_fd, get_file(file)); __receive_sock(file); return new_fd; } int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } int receive_fd(struct file *file, unsigned int o_flags) { return __receive_fd(file, NULL, o_flags); } EXPORT_SYMBOL_GPL(receive_fd); static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; int retval = oldfd; rcu_read_lock(); if (!files_lookup_fd_rcu(files, oldfd)) retval = -EBADF; rcu_read_unlock(); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd);
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct debug_req_info { struct ethnl_req_info base; }; struct debug_reply_data { struct ethnl_reply_data base; u32 msg_mask; }; #define DEBUG_REPDATA(__reply_base) \ container_of(__reply_base, struct debug_reply_data, base) const struct nla_policy ethnl_debug_get_policy[] = { [ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int debug_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) { struct debug_reply_data *data = DEBUG_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_msglevel) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->msg_mask = dev->ethtool_ops->get_msglevel(dev); ethnl_ops_complete(dev); return 0; } static int debug_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; return ethnl_bitset32_size(&data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, netif_msg_class_names, compact); } static int debug_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK, &data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, netif_msg_class_names, compact); } const struct ethnl_request_ops ethnl_debug_request_ops = { .request_cmd = ETHTOOL_MSG_DEBUG_GET, .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, .hdr_attr = ETHTOOL_A_DEBUG_HEADER, .req_info_size = sizeof(struct debug_req_info), .reply_data_size = sizeof(struct debug_reply_data), .prepare_data = debug_prepare_data, .reply_size = debug_reply_size, .fill_reply = debug_fill_reply, }; /* DEBUG_SET */ const struct nla_policy ethnl_debug_set_policy[] = { [ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED }, }; int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info) { struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct net_device *dev; bool mod = false; u32 msg_mask; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_DEBUG_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel) goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; msg_mask = dev->ethtool_ops->get_msglevel(dev); ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT, tb[ETHTOOL_A_DEBUG_MSGMASK], netif_msg_class_names, info->extack, &mod); if (ret < 0 || !mod) goto out_ops; dev->ethtool_ops->set_msglevel(dev, msg_mask); ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); out_dev: dev_put(dev); return ret; }
573 10 10 564 564 564 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2019, Tessares SA. */ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/netns/generic.h> #include "protocol.h" #define MPTCP_SYSCTL_PATH "net/mptcp" static int mptcp_pernet_id; struct mptcp_pernet { #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; #endif unsigned int add_addr_timeout; unsigned int stale_loss_cnt; u8 mptcp_enabled; u8 checksum_enabled; u8 allow_join_initial_addr_port; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) { return net_generic(net, mptcp_pernet_id); } int mptcp_is_enabled(const struct net *net) { return mptcp_get_pernet(net)->mptcp_enabled; } unsigned int mptcp_get_add_addr_timeout(const struct net *net) { return mptcp_get_pernet(net)->add_addr_timeout; } int mptcp_is_checksum_enabled(const struct net *net) { return mptcp_get_pernet(net)->checksum_enabled; } int mptcp_allow_join_id0(const struct net *net) { return mptcp_get_pernet(net)->allow_join_initial_addr_port; } unsigned int mptcp_stale_loss_cnt(const struct net *net) { return mptcp_get_pernet(net)->stale_loss_cnt; } static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; pernet->add_addr_timeout = TCP_RTO_MAX; pernet->checksum_enabled = 0; pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; } #ifdef CONFIG_SYSCTL static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", .maxlen = sizeof(u8), .mode = 0644, /* users with CAP_NET_ADMIN or root (not and) can change this * value, same as other sysctl or the 'net' tree. */ .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "add_addr_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "checksum_enabled", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "allow_join_initial_addr_port", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "stale_loss_cnt", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, }, {} }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { struct ctl_table_header *hdr; struct ctl_table *table; table = mptcp_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; table[2].data = &pernet->checksum_enabled; table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) goto err_reg; pernet->ctl_table_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) { struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; unregister_net_sysctl_table(pernet->ctl_table_hdr); kfree(table); } #else static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { return 0; } static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {} #endif /* CONFIG_SYSCTL */ static int __net_init mptcp_net_init(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); mptcp_pernet_set_defaults(pernet); return mptcp_pernet_new_table(net, pernet); } /* Note: the callback will only be called per extra netns */ static void __net_exit mptcp_net_exit(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); mptcp_pernet_del_table(pernet); } static struct pernet_operations mptcp_pernet_ops = { .init = mptcp_net_init, .exit = mptcp_net_exit, .id = &mptcp_pernet_id, .size = sizeof(struct mptcp_pernet), }; void __init mptcp_init(void) { mptcp_join_cookie_init(); mptcp_proto_init(); if (register_pernet_subsys(&mptcp_pernet_ops) < 0) panic("Failed to register MPTCP pernet subsystem.\n"); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcpv6_init(void) { int err; err = mptcp_proto_v6_init(); return err; } #endif
22 22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 // SPDX-License-Identifier: GPL-2.0-or-later /* * kernel/stop_machine.c * * Copyright (C) 2008, 2005 IBM Corporation. * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> */ #include <linux/compiler.h> #include <linux/completion.h> #include <linux/cpu.h> #include <linux/init.h> #include <linux/kthread.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/sched.h> #include <linux/stop_machine.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/smpboot.h> #include <linux/atomic.h> #include <linux/nmi.h> #include <linux/sched/wake_q.h> /* * Structure to determine completion condition and record errors. May * be shared by works on different cpus. */ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ int ret; /* collected return value */ struct completion completion; /* fired if nr_todo reaches 0 */ }; /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { struct task_struct *thread; raw_spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ struct cpu_stop_work stop_work; /* for stop_cpus */ unsigned long caller; cpu_stop_fn_t fn; }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; void print_stop_info(const char *log_lvl, struct task_struct *task) { /* * If @task is a stopper task, it cannot migrate and task_cpu() is * stable. */ struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); if (task != stopper->thread) return; printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); } /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); static bool stop_cpus_in_progress; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); atomic_set(&done->nr_todo, nr_todo); init_completion(&done->completion); } /* signal completion unless @done is NULL */ static void cpu_stop_signal_done(struct cpu_stop_done *done) { if (atomic_dec_and_test(&done->nr_todo)) complete(&done->completion); } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, struct cpu_stop_work *work, struct wake_q_head *wakeq) { list_add_tail(&work->list, &stopper->works); wake_q_add(wakeq, stopper->thread); } /* queue @work to @stopper. if offline, @work is completed immediately */ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); DEFINE_WAKE_Q(wakeq); unsigned long flags; bool enabled; preempt_disable(); raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) __cpu_stop_queue_work(stopper, work, &wakeq); else if (work->done) cpu_stop_signal_done(work->done); raw_spin_unlock_irqrestore(&stopper->lock, flags); wake_up_q(&wakeq); preempt_enable(); return enabled; } /** * stop_one_cpu - stop a cpu * @cpu: cpu to stop * @fn: function to execute * @arg: argument to @fn * * Execute @fn(@arg) on @cpu. @fn is run in a process context with * the highest priority preempting any task on the cpu and * monopolizing it. This function returns after the execution is * complete. * * This function doesn't guarantee @cpu stays online till @fn * completes. If @cpu goes down in the middle, execution may happen * partially or fully on different cpus. @fn should either be ready * for that or the caller should ensure that @cpu stays online until * this function completes. * * CONTEXT: * Might sleep. * * RETURNS: * -ENOENT if @fn(@arg) was not executed because @cpu was offline; * otherwise, the return value of @fn. */ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) return -ENOENT; /* * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup * cycle by doing a preemption: */ cond_resched(); wait_for_completion(&done.completion); return done.ret; } /* This controls the threads on each CPU. */ enum multi_stop_state { /* Dummy starting state for thread. */ MULTI_STOP_NONE, /* Awaiting everyone to be scheduled. */ MULTI_STOP_PREPARE, /* Disable interrupts. */ MULTI_STOP_DISABLE_IRQ, /* Run the function */ MULTI_STOP_RUN, /* Exit */ MULTI_STOP_EXIT, }; struct multi_stop_data { cpu_stop_fn_t fn; void *data; /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ unsigned int num_threads; const struct cpumask *active_cpus; enum multi_stop_state state; atomic_t thread_ack; }; static void set_state(struct multi_stop_data *msdata, enum multi_stop_state newstate) { /* Reset ack counter. */ atomic_set(&msdata->thread_ack, msdata->num_threads); smp_wmb(); WRITE_ONCE(msdata->state, newstate); } /* Last one to ack a state moves to the next state. */ static void ack_state(struct multi_stop_data *msdata) { if (atomic_dec_and_test(&msdata->thread_ack)) set_state(msdata, msdata->state + 1); } notrace void __weak stop_machine_yield(const struct cpumask *cpumask) { cpu_relax(); } /* This is the cpu_stop function which stops the CPU. */ static int multi_cpu_stop(void *data) { struct multi_stop_data *msdata = data; enum multi_stop_state newstate, curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; const struct cpumask *cpumask; unsigned long flags; bool is_active; /* * When called from stop_machine_from_inactive_cpu(), irq might * already be disabled. Save the state and restore it on exit. */ local_save_flags(flags); if (!msdata->active_cpus) { cpumask = cpu_online_mask; is_active = cpu == cpumask_first(cpumask); } else { cpumask = msdata->active_cpus; is_active = cpumask_test_cpu(cpu, cpumask); } /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ stop_machine_yield(cpumask); newstate = READ_ONCE(msdata->state); if (newstate != curstate) { curstate = newstate; switch (curstate) { case MULTI_STOP_DISABLE_IRQ: local_irq_disable(); hard_irq_disable(); break; case MULTI_STOP_RUN: if (is_active) err = msdata->fn(msdata->data); break; default: break; } ack_state(msdata); } else if (curstate > MULTI_STOP_PREPARE) { /* * At this stage all other CPUs we depend on must spin * in the same loop. Any reason for hard-lockup should * be detected and reported on their side. */ touch_nmi_watchdog(); } rcu_momentary_dyntick_idle(); } while (curstate != MULTI_STOP_EXIT); local_irq_restore(flags); return err; } static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, int cpu2, struct cpu_stop_work *work2) { struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; retry: /* * The waking up of stopper threads has to happen in the same * scheduling context as the queueing. Otherwise, there is a * possibility of one of the above stoppers being woken up by another * CPU, and preempting us. This will cause us to not wake up the other * stopper forever. */ preempt_disable(); raw_spin_lock_irq(&stopper1->lock); raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); if (!stopper1->enabled || !stopper2->enabled) { err = -ENOENT; goto unlock; } /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. * * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has * queued a work on cpu1 but not on cpu2, we hold both locks. * * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ if (unlikely(stop_cpus_in_progress)) { err = -EDEADLK; goto unlock; } err = 0; __cpu_stop_queue_work(stopper1, work1, &wakeq); __cpu_stop_queue_work(stopper2, work2, &wakeq); unlock: raw_spin_unlock(&stopper2->lock); raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { preempt_enable(); while (stop_cpus_in_progress) cpu_relax(); goto retry; } wake_up_q(&wakeq); preempt_enable(); return err; } /** * stop_two_cpus - stops two cpus * @cpu1: the cpu to stop * @cpu2: the other cpu to stop * @fn: function to execute * @arg: argument to @fn * * Stops both the current and specified CPU and runs @fn on one of them. * * returns when both are completed. */ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; struct cpu_stop_work work1, work2; struct multi_stop_data msdata; msdata = (struct multi_stop_data){ .fn = fn, .data = arg, .num_threads = 2, .active_cpus = cpumask_of(cpu1), }; work1 = work2 = (struct cpu_stop_work){ .fn = multi_cpu_stop, .arg = &msdata, .done = &done, .caller = _RET_IP_, }; cpu_stop_init_done(&done, 2); set_state(&msdata, MULTI_STOP_PREPARE); if (cpu1 > cpu2) swap(cpu1, cpu2); if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) return -ENOENT; wait_for_completion(&done.completion); return done.ret; } /** * stop_one_cpu_nowait - stop a cpu but don't wait for completion * @cpu: cpu to stop * @fn: function to execute * @arg: argument to @fn * @work_buf: pointer to cpu_stop_work structure * * Similar to stop_one_cpu() but doesn't wait for completion. The * caller is responsible for ensuring @work_buf is currently unused * and will remain untouched until stopper starts executing @fn. * * CONTEXT: * Don't care. * * RETURNS: * true if cpu_stop_work was queued successfully and @fn will be called, * false otherwise. */ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; return cpu_stop_queue_work(cpu, work_buf); } static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) { struct cpu_stop_work *work; unsigned int cpu; bool queued = false; /* * Disable preemption while queueing to avoid getting * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ preempt_disable(); stop_cpus_in_progress = true; barrier(); for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; work->arg = arg; work->done = done; work->caller = _RET_IP_; if (cpu_stop_queue_work(cpu, work)) queued = true; } barrier(); stop_cpus_in_progress = false; preempt_enable(); return queued; } static int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); if (!queue_stop_cpus_work(cpumask, fn, arg, &done)) return -ENOENT; wait_for_completion(&done.completion); return done.ret; } /** * stop_cpus - stop multiple cpus * @cpumask: cpus to stop * @fn: function to execute * @arg: argument to @fn * * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu, * @fn is run in a process context with the highest priority * preempting any task on the cpu and monopolizing it. This function * returns after all executions are complete. * * This function doesn't guarantee the cpus in @cpumask stay online * till @fn completes. If some cpus go down in the middle, execution * on the cpu may happen partially or fully on different cpus. @fn * should either be ready for that or the caller should ensure that * the cpus stay online until this function completes. * * All stop_cpus() calls are serialized making it safe for @fn to wait * for all cpus to start executing it. * * CONTEXT: * Might sleep. * * RETURNS: * -ENOENT if @fn(@arg) was not executed at all because all cpus in * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. */ static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { int ret; /* static works are used, process one request at a time */ mutex_lock(&stop_cpus_mutex); ret = __stop_cpus(cpumask, fn, arg); mutex_unlock(&stop_cpus_mutex); return ret; } static int cpu_stop_should_run(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); unsigned long flags; int run; raw_spin_lock_irqsave(&stopper->lock, flags); run = !list_empty(&stopper->works); raw_spin_unlock_irqrestore(&stopper->lock, flags); return run; } static void cpu_stopper_thread(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct cpu_stop_work *work; repeat: work = NULL; raw_spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } raw_spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; int ret; /* cpu stop callbacks must not sleep, make in_atomic() == T */ stopper->caller = work->caller; stopper->fn = fn; preempt_count_inc(); ret = fn(arg); if (done) { if (ret) done->ret = ret; cpu_stop_signal_done(done); } preempt_count_dec(); stopper->fn = NULL; stopper->caller = 0; WARN_ONCE(preempt_count(), "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; } } void stop_machine_park(int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); /* * Lockless. cpu_stopper_thread() will take stopper->lock and flush * the pending works before it parks, until then it is fine to queue * the new works. */ stopper->enabled = false; kthread_park(stopper->thread); } extern void sched_set_stop_task(int cpu, struct task_struct *stop); static void cpu_stop_create(unsigned int cpu) { sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); } static void cpu_stop_park(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); WARN_ON(!list_empty(&stopper->works)); } void stop_machine_unpark(int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); stopper->enabled = true; kthread_unpark(stopper->thread); } static struct smp_hotplug_thread cpu_stop_threads = { .store = &cpu_stopper.thread, .thread_should_run = cpu_stop_should_run, .thread_fn = cpu_stopper_thread, .thread_comm = "migration/%u", .create = cpu_stop_create, .park = cpu_stop_park, .selfparking = true, }; static int __init cpu_stop_init(void) { unsigned int cpu; for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); raw_spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); } BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); stop_machine_unpark(raw_smp_processor_id()); stop_machine_initialized = true; return 0; } early_initcall(cpu_stop_init); int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, .data = data, .num_threads = num_online_cpus(), .active_cpus = cpus, }; lockdep_assert_cpus_held(); if (!stop_machine_initialized) { /* * Handle the case where stop_machine() is called * early in boot before stop_machine() has been * initialized. */ unsigned long flags; int ret; WARN_ON_ONCE(msdata.num_threads != 1); local_irq_save(flags); hard_irq_disable(); ret = (*fn)(data); local_irq_restore(flags); return ret; } /* Set the initial state and stop all online cpus. */ set_state(&msdata, MULTI_STOP_PREPARE); return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); } int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { int ret; /* No CPUs can come up or down during this. */ cpus_read_lock(); ret = stop_machine_cpuslocked(fn, data, cpus); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); /** * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU * @fn: the function to run * @data: the data ptr for the @fn() * @cpus: the cpus to run the @fn() on (NULL = any online cpu) * * This is identical to stop_machine() but can be called from a CPU which * is not active. The local CPU is in the process of hotplug (so no other * CPU hotplug can start) and not marked active and doesn't have enough * context to sleep. * * This function provides stop_machine() functionality for such state by * using busy-wait for synchronization and executing @fn directly for local * CPU. * * CONTEXT: * Local CPU is inactive. Temporarily stops all active CPUs. * * RETURNS: * 0 if all executions of @fn returned 0, any non zero return value if any * returned non zero. */ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, .data = data, .active_cpus = cpus }; struct cpu_stop_done done; int ret; /* Local CPU must be inactive and CPU hotplug in progress. */ BUG_ON(cpu_active(raw_smp_processor_id())); msdata.num_threads = num_active_cpus() + 1; /* +1 for local */ /* No proper task established and can't sleep - busy wait for lock. */ while (!mutex_trylock(&stop_cpus_mutex)) cpu_relax(); /* Schedule work on other CPUs and execute directly for local CPU */ set_state(&msdata, MULTI_STOP_PREPARE); cpu_stop_init_done(&done, num_active_cpus()); queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata, &done); ret = multi_cpu_stop(&msdata); /* Busy wait for completion. */ while (!completion_done(&done.completion)) cpu_relax(); mutex_unlock(&stop_cpus_mutex); return ret ?: done.ret; }
13 13 13 13 157 157 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PTRACE_H #define _LINUX_PTRACE_H #include <linux/compiler.h> /* For unlikely. */ #include <linux/sched.h> /* For struct task_struct. */ #include <linux/sched/signal.h> /* For send_sig(), same_thread_group(), etc. */ #include <linux/err.h> /* for IS_ERR_VALUE */ #include <linux/bug.h> /* For BUG_ON. */ #include <linux/pid_namespace.h> /* For task_active_pid_ns. */ #include <uapi/linux/ptrace.h> #include <linux/seccomp.h> /* Add sp to seccomp_data, as seccomp is user API, we don't want to modify it */ struct syscall_info { __u64 sp; struct seccomp_data data; }; extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); /* * Ptrace flags * * The owner ship rules for task->ptrace which holds the ptrace * flags is simple. When a task is running it owns it's task->ptrace * flags. When the a task is stopped the ptracer owns task->ptrace. */ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ #define PT_EVENT_FLAG(event) (1 << (PT_OPT_FLAG_SHIFT + (event))) #define PT_TRACESYSGOOD PT_EVENT_FLAG(0) #define PT_TRACE_FORK PT_EVENT_FLAG(PTRACE_EVENT_FORK) #define PT_TRACE_VFORK PT_EVENT_FLAG(PTRACE_EVENT_VFORK) #define PT_TRACE_CLONE PT_EVENT_FLAG(PTRACE_EVENT_CLONE) #define PT_TRACE_EXEC PT_EVENT_FLAG(PTRACE_EVENT_EXEC) #define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE) #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) #define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) #define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) extern long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len); extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern void ptrace_disable(struct task_struct *); extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 #define PTRACE_MODE_FSCREDS 0x08 #define PTRACE_MODE_REALCREDS 0x10 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) /** * ptrace_may_access - check whether the caller is permitted to access * a target task. * @task: target task * @mode: selects type of access and caller credentials * * Returns true on success, false on denial. * * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must * be set in @mode to specify whether the access was requested through * a filesystem syscall (should use effective capabilities and fsuid * of the caller) or through an explicit syscall such as * process_vm_writev or ptrace (and should use the real credentials). */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); } static inline void ptrace_unlink(struct task_struct *child) { if (unlikely(child->ptrace)) __ptrace_unlink(child); } int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long data); int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, unsigned long data); /** * ptrace_parent - return the task that is tracing the given task * @task: task to consider * * Returns %NULL if no one is tracing @task, or the &struct task_struct * pointer to its tracer. * * Must called under rcu_read_lock(). The pointer returned might be kept * live only by RCU. During exec, this may be called with task_lock() held * on @task, still held from when check_unsafe_exec() was called. */ static inline struct task_struct *ptrace_parent(struct task_struct *task) { if (unlikely(task->ptrace)) return rcu_dereference(task->parent); return NULL; } /** * ptrace_event_enabled - test whether a ptrace event is enabled * @task: ptracee of interest * @event: %PTRACE_EVENT_* to test * * Test whether @event is enabled for ptracee @task. * * Returns %true if @event is enabled, %false otherwise. */ static inline bool ptrace_event_enabled(struct task_struct *task, int event) { return task->ptrace & PT_EVENT_FLAG(event); } /** * ptrace_event - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @message: value for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @message * to the ptrace parent. * * Called without locks. */ static inline void ptrace_event(int event, unsigned long message) { if (unlikely(ptrace_event_enabled(current, event))) { current->ptrace_message = message; ptrace_notify((event << 8) | SIGTRAP); } else if (event == PTRACE_EVENT_EXEC) { /* legacy EXEC report via SIGTRAP */ if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED) send_sig(SIGTRAP, current, 0); } } /** * ptrace_event_pid - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @pid: process identifier for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @pid * to the ptrace parent. @pid is reported as the pid_t seen from the * ptrace parent's pid namespace. * * Called without locks. */ static inline void ptrace_event_pid(int event, struct pid *pid) { /* * FIXME: There's a potential race if a ptracer in a different pid * namespace than parent attaches between computing message below and * when we acquire tasklist_lock in ptrace_stop(). If this happens, * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. */ unsigned long message = 0; struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(rcu_dereference(current->parent)); if (ns) message = pid_nr_ns(pid, ns); rcu_read_unlock(); ptrace_event(event, message); } /** * ptrace_init_task - initialize ptrace state for a new child * @child: new child task * @ptrace: true if child should be ptrace'd by parent's tracer * * This is called immediately after adding @child to its parent's children * list. @ptrace is false in the normal case, and true to ptrace @child. * * Called with current's siglock and write_lock_irq(&tasklist_lock) held. */ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); else sigaddset(&child->pending.signal, SIGSTOP); } else child->ptracer_cred = NULL; } /** * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped * @task: task in %EXIT_DEAD state * * Called with write_lock(&tasklist_lock) held. */ static inline void ptrace_release_task(struct task_struct *task) { BUG_ON(!list_empty(&task->ptraced)); ptrace_unlink(task); BUG_ON(!list_empty(&task->ptrace_entry)); } #ifndef force_successful_syscall_return /* * System call handlers that, upon successful completion, need to return a * negative value should call force_successful_syscall_return() right before * returning. On architectures where the syscall convention provides for a * separate error flag (e.g., alpha, ia64, ppc{,64}, sparc{,64}, possibly * others), this macro can be used to ensure that the error flag will not get * set. On architectures which do not support a separate error flag, the macro * is a no-op and the spurious error condition needs to be filtered out by some * other means (e.g., in user-level, by passing an extra argument to the * syscall handler, or something along those lines). */ #define force_successful_syscall_return() do { } while (0) #endif #ifndef is_syscall_success /* * On most systems we can tell if a syscall is a success based on if the retval * is an error value. On some systems like ia64 and powerpc they have different * indicators of success/failure and must define their own. */ #define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) #endif /* * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__. * * These do-nothing inlines are used when the arch does not * implement single-step. The kerneldoc comments are here * to document the interface for all arch definitions. */ #ifndef arch_has_single_step /** * arch_has_single_step - does this CPU support user-mode single-step? * * If this is defined, then there must be function declarations or * inlines for user_enable_single_step() and user_disable_single_step(). * arch_has_single_step() should evaluate to nonzero iff the machine * supports instruction single-step for user mode. * It can be a constant or it can test a CPU feature bit. */ #define arch_has_single_step() (0) /** * user_enable_single_step - single-step in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_single_step() has returned nonzero. * Set @task so that when it returns to user mode, it will trap after the * next single instruction executes. If arch_has_block_step() is defined, * this must clear the effects of user_enable_block_step() too. */ static inline void user_enable_single_step(struct task_struct *task) { BUG(); /* This can never be called. */ } /** * user_disable_single_step - cancel user-mode single-step * @task: either current or a task stopped in %TASK_TRACED * * Clear @task of the effects of user_enable_single_step() and * user_enable_block_step(). This can be called whether or not either * of those was ever called on @task, and even if arch_has_single_step() * returned zero. */ static inline void user_disable_single_step(struct task_struct *task) { } #else extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); #endif /* arch_has_single_step */ #ifndef arch_has_block_step /** * arch_has_block_step - does this CPU support user-mode block-step? * * If this is defined, then there must be a function declaration or inline * for user_enable_block_step(), and arch_has_single_step() must be defined * too. arch_has_block_step() should evaluate to nonzero iff the machine * supports step-until-branch for user mode. It can be a constant or it * can test a CPU feature bit. */ #define arch_has_block_step() (0) /** * user_enable_block_step - step until branch in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_block_step() has returned nonzero, * and will never be called when single-instruction stepping is being used. * Set @task so that when it returns to user mode, it will trap after the * next branch or trap taken. */ static inline void user_enable_block_step(struct task_struct *task) { BUG(); /* This can never be called. */ } #else extern void user_enable_block_step(struct task_struct *); #endif /* arch_has_block_step */ #ifdef ARCH_HAS_USER_SINGLE_STEP_REPORT extern void user_single_step_report(struct pt_regs *regs); #else static inline void user_single_step_report(struct pt_regs *regs) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = SI_USER; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } #endif #ifndef arch_ptrace_stop_needed /** * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called * @code: current->exit_code value ptrace will stop with * @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with * * This is called with the siglock held, to decide whether or not it's * necessary to release the siglock and call arch_ptrace_stop() with the * same @code and @info arguments. It can be defined to a constant if * arch_ptrace_stop() is never required, or always is. On machines where * this makes sense, it should be defined to a quick test to optimize out * calling arch_ptrace_stop() when it would be superfluous. For example, * if the thread has not been back to user mode since the last stop, the * thread state might indicate that nothing needs to be done. * * This is guaranteed to be invoked once before a task stops for ptrace and * may include arch-specific operations necessary prior to a ptrace stop. */ #define arch_ptrace_stop_needed(code, info) (0) #endif #ifndef arch_ptrace_stop /** * arch_ptrace_stop - Do machine-specific work before stopping for ptrace * @code: current->exit_code value ptrace will stop with * @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with * * This is called with no locks held when arch_ptrace_stop_needed() has * just returned nonzero. It is allowed to block, e.g. for user memory * access. The arch can have machine-specific work to be done before * ptrace stops. On ia64, register backing store gets written back to user * memory here. Since this can be costly (requires dropping the siglock), * we only do it when the arch requires it for this particular stop, as * indicated by arch_ptrace_stop_needed(). */ #define arch_ptrace_stop(code, info) do { } while (0) #endif #ifndef current_pt_regs #define current_pt_regs() task_pt_regs(current) #endif /* * unlike current_pt_regs(), this one is equal to task_pt_regs(current) * on *all* architectures; the only reason to have a per-arch definition * is optimisation. */ #ifndef signal_pt_regs #define signal_pt_regs() task_pt_regs(current) #endif #ifndef current_user_stack_pointer #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); #endif
2 1 1 1 1 1 2 15 10 1 9 2 1 2 1 2 1 2 1 3 1 1 1 1 90 90 1 33 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 // SPDX-License-Identifier: GPL-2.0-or-later /* * Ioctl handler * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/if_bridge.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/times.h> #include <net/net_namespace.h> #include <linux/uaccess.h> #include "br_private.h" static int get_bridge_ifindices(struct net *net, int *indices, int num) { struct net_device *dev; int i = 0; rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (i >= num) break; if (dev->priv_flags & IFF_EBRIDGE) indices[i++] = dev->ifindex; } rcu_read_unlock(); return i; } /* called with RTNL */ static void get_port_ifindices(struct net_bridge *br, int *ifindices, int num) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->port_no < num) ifindices[p->port_no] = p->dev->ifindex; } } /* * Format up to a page worth of forwarding table entries * userbuf -- where to copy result * maxnum -- maximum number of entries desired * (limited to a page for sanity) * offset -- number of records to skip */ static int get_fdb_entries(struct net_bridge *br, void __user *userbuf, unsigned long maxnum, unsigned long offset) { int num; void *buf; size_t size; /* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */ if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry)) maxnum = PAGE_SIZE/sizeof(struct __fdb_entry); size = maxnum * sizeof(struct __fdb_entry); buf = kmalloc(size, GFP_USER); if (!buf) return -ENOMEM; num = br_fdb_fillbuf(br, buf, maxnum, offset); if (num > 0) { if (copy_to_user(userbuf, buf, array_size(num, sizeof(struct __fdb_entry)))) num = -EFAULT; } kfree(buf); return num; } /* called with RTNL */ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) { struct net *net = dev_net(br->dev); struct net_device *dev; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; dev = __dev_get_by_index(net, ifindex); if (dev == NULL) return -EINVAL; if (isadd) ret = br_add_if(br, dev, NULL); else ret = br_del_if(br, dev); return ret; } /* * Legacy ioctl's through SIOCDEVPRIVATE * This interface is deprecated because it was too difficult * to do the translation for 32/64bit ioctl compatibility. */ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p = NULL; unsigned long args[4]; void __user *argp; int ret = -EOPNOTSUPP; if (in_compat_syscall()) { unsigned int cargs[4]; if (copy_from_user(cargs, data, sizeof(cargs))) return -EFAULT; args[0] = cargs[0]; args[1] = cargs[1]; args[2] = cargs[2]; args[3] = cargs[3]; argp = compat_ptr(args[1]); } else { if (copy_from_user(args, data, sizeof(args))) return -EFAULT; argp = (void __user *)args[1]; } switch (args[0]) { case BRCTL_ADD_IF: case BRCTL_DEL_IF: return add_del_if(br, args[1], args[0] == BRCTL_ADD_IF); case BRCTL_GET_BRIDGE_INFO: { struct __bridge_info b; memset(&b, 0, sizeof(struct __bridge_info)); rcu_read_lock(); memcpy(&b.designated_root, &br->designated_root, 8); memcpy(&b.bridge_id, &br->bridge_id, 8); b.root_path_cost = br->root_path_cost; b.max_age = jiffies_to_clock_t(br->max_age); b.hello_time = jiffies_to_clock_t(br->hello_time); b.forward_delay = br->forward_delay; b.bridge_max_age = br->bridge_max_age; b.bridge_hello_time = br->bridge_hello_time; b.bridge_forward_delay = jiffies_to_clock_t(br->bridge_forward_delay); b.topology_change = br->topology_change; b.topology_change_detected = br->topology_change_detected; b.root_port = br->root_port; b.stp_enabled = (br->stp_enabled != BR_NO_STP); b.ageing_time = jiffies_to_clock_t(br->ageing_time); b.hello_timer_value = br_timer_value(&br->hello_timer); b.tcn_timer_value = br_timer_value(&br->tcn_timer); b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); b.gc_timer_value = br_timer_value(&br->gc_work.timer); rcu_read_unlock(); if (copy_to_user((void __user *)args[1], &b, sizeof(b))) return -EFAULT; return 0; } case BRCTL_GET_PORT_LIST: { int num, *indices; num = args[2]; if (num < 0) return -EINVAL; if (num == 0) num = 256; if (num > BR_MAX_PORTS) num = BR_MAX_PORTS; indices = kcalloc(num, sizeof(int), GFP_KERNEL); if (indices == NULL) return -ENOMEM; get_port_ifindices(br, indices, num); if (copy_to_user(argp, indices, array_size(num, sizeof(int)))) num = -EFAULT; kfree(indices); return num; } case BRCTL_SET_BRIDGE_FORWARD_DELAY: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_forward_delay(br, args[1]); break; case BRCTL_SET_BRIDGE_HELLO_TIME: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_hello_time(br, args[1]); break; case BRCTL_SET_BRIDGE_MAX_AGE: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_max_age(br, args[1]); break; case BRCTL_SET_AGEING_TIME: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_ageing_time(br, args[1]); break; case BRCTL_GET_PORT_INFO: { struct __port_info p; struct net_bridge_port *pt; rcu_read_lock(); if ((pt = br_get_port(br, args[2])) == NULL) { rcu_read_unlock(); return -EINVAL; } memset(&p, 0, sizeof(struct __port_info)); memcpy(&p.designated_root, &pt->designated_root, 8); memcpy(&p.designated_bridge, &pt->designated_bridge, 8); p.port_id = pt->port_id; p.designated_port = pt->designated_port; p.path_cost = pt->path_cost; p.designated_cost = pt->designated_cost; p.state = pt->state; p.top_change_ack = pt->topology_change_ack; p.config_pending = pt->config_pending; p.message_age_timer_value = br_timer_value(&pt->message_age_timer); p.forward_delay_timer_value = br_timer_value(&pt->forward_delay_timer); p.hold_timer_value = br_timer_value(&pt->hold_timer); rcu_read_unlock(); if (copy_to_user(argp, &p, sizeof(p))) return -EFAULT; return 0; } case BRCTL_SET_BRIDGE_STP_STATE: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_stp_set_enabled(br, args[1], NULL); break; case BRCTL_SET_BRIDGE_PRIORITY: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; br_stp_set_bridge_priority(br, args[1]); ret = 0; break; case BRCTL_SET_PORT_PRIORITY: { if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; spin_lock_bh(&br->lock); if ((p = br_get_port(br, args[1])) == NULL) ret = -EINVAL; else ret = br_stp_set_port_priority(p, args[2]); spin_unlock_bh(&br->lock); break; } case BRCTL_SET_PATH_COST: { if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; spin_lock_bh(&br->lock); if ((p = br_get_port(br, args[1])) == NULL) ret = -EINVAL; else ret = br_stp_set_path_cost(p, args[2]); spin_unlock_bh(&br->lock); break; } case BRCTL_GET_FDB_ENTRIES: return get_fdb_entries(br, argp, args[2], args[3]); } if (!ret) { if (p) br_ifinfo_notify(RTM_NEWLINK, NULL, p); else netdev_state_change(br->dev); } return ret; } static int old_deviceless(struct net *net, void __user *uarg) { unsigned long args[3]; if (copy_from_user(args, uarg, sizeof(args))) return -EFAULT; switch (args[0]) { case BRCTL_GET_VERSION: return BRCTL_VERSION; case BRCTL_GET_BRIDGES: { int *indices; int ret = 0; if (args[2] >= 2048) return -ENOMEM; indices = kcalloc(args[2], sizeof(int), GFP_KERNEL); if (indices == NULL) return -ENOMEM; args[2] = get_bridge_ifindices(net, indices, args[2]); ret = copy_to_user((void __user *)args[1], indices, array_size(args[2], sizeof(int))) ? -EFAULT : args[2]; kfree(indices); return ret; } case BRCTL_ADD_BRIDGE: case BRCTL_DEL_BRIDGE: { char buf[IFNAMSIZ]; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(buf, (void __user *)args[1], IFNAMSIZ)) return -EFAULT; buf[IFNAMSIZ-1] = 0; if (args[0] == BRCTL_ADD_BRIDGE) return br_add_bridge(net, buf); return br_del_bridge(net, buf); } } return -EOPNOTSUPP; } int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg) { int ret = -EOPNOTSUPP; rtnl_lock(); switch (cmd) { case SIOCGIFBR: case SIOCSIFBR: ret = old_deviceless(net, uarg); break; case SIOCBRADDBR: case SIOCBRDELBR: { char buf[IFNAMSIZ]; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } if (copy_from_user(buf, uarg, IFNAMSIZ)) { ret = -EFAULT; break; } buf[IFNAMSIZ-1] = 0; if (cmd == SIOCBRADDBR) ret = br_add_bridge(net, buf); else ret = br_del_bridge(net, buf); } break; case SIOCBRADDIF: case SIOCBRDELIF: ret = add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF); break; } rtnl_unlock(); return ret; }
208 210 210 9 209 205 204 203 204 205 206 206 1 204 206 206 1 205 206 206 202 201 7 208 9 5 5 9 2 205 203 204 206 209 208 211 211 210 205 210 207 210 22 209 210 206 205 5 4 205 210 210 5 208 206 205 206 9 208 208 209 205 208 206 206 205 203 202 205 204 29 206 205 22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/printk.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Modified to make sys_syslog() more flexible: added commands to * return the last 4k of kernel messages, regardless of whether * they've been read or not. Added option to suppress kernel printk's * to the console. Added hook for sending the console messages * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn. * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/console.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/crash_core.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/uaccess.h> #include <asm/sections.h> #include <trace/events/initcall.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> #include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. */ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* * console_sem protects the console_drivers list, and also * provides serialisation for access to the entire console * driver system. */ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); /* * System may need to suppress printk message under certain * circumstances, like after kernel panic happens. */ int __read_mostly suppress_printk; #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; #endif enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, __DEVKMSG_LOG_BIT_LOCK, }; enum devkmsg_log_masks { DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), }; /* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */ #define DEVKMSG_LOG_MASK_DEFAULT 0 static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { size_t len; if (!str) return -EINVAL; len = str_has_prefix(str, "on"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; return len; } len = str_has_prefix(str, "off"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; return len; } len = str_has_prefix(str, "ratelimit"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; return len; } return -EINVAL; } static int __init control_devkmsg(char *str) { if (__control_devkmsg(str) < 0) { pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; } /* * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) strcpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) strcpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* * Sysctl cannot change it anymore. The kernel command line setting of * this parameter is to force the setting to be permanent throughout the * runtime of the system. This is a precation measure against userspace * trying to be a smarta** and attempting to change it up on us. */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; return 1; } __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; int err; if (write) { if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) return -EINVAL; old = devkmsg_log; strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE); } err = proc_dostring(table, write, buffer, lenp, ppos); if (err) return err; if (write) { err = __control_devkmsg(devkmsg_log_str); /* * Do not accept an unknown string OR a known string with * trailing crap... */ if (err < 0 || (err + 1 != *lenp)) { /* ... and restore old setting. */ devkmsg_log = old; strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE); return -EINVAL; } } return 0; } /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ #define down_console_sem() do { \ down(&console_sem);\ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ } while (0) static int __down_trylock_console_sem(unsigned long ip) { int lock_failed; unsigned long flags; /* * Here and in __up_console_sem() we need to be in safe mode, * because spindump/WARN/etc from under console ->lock will * deadlock in printk()->down_trylock_console_sem() otherwise. */ printk_safe_enter_irqsave(flags); lock_failed = down_trylock(&console_sem); printk_safe_exit_irqrestore(flags); if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) static void __up_console_sem(unsigned long ip) { unsigned long flags; mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ * hold it and are racing, but it helps tracking those weird code * paths in the console code where we end up in places I want * locked without the console semaphore held). */ static int console_locked, console_suspended; /* * If exclusive_console is non-NULL then only this console is to be printed to. */ static struct console *exclusive_console; /* * Array of consoles built from command line options (console=) */ #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; static bool has_preferred_console; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; enum con_msg_format_flags { MSG_FORMAT_DEFAULT = 0, MSG_FORMAT_SYSLOG = (1 << 0), }; static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each * containing variable length message text. Every record also contains its * own meta-data (@info). * * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel * messages use LOG_KERN; userspace-injected messages always carry a matching * syslog facility, by default LOG_USER. The origin of every message can be * reliably determined that way. * * The human readable log message of a record is available in @text, the * length of the message text in @text_len. The stored message is not * terminated. * * Optionally, a record can carry a dictionary of properties (key/value * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier * b12:8 block dev_t * c127:3 char dev_t * n8 netdev ifindex * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * * Valid characters in property names are [a-zA-Z0-9.-_]. Property names * and values are terminated by a '\0' character. * * Example of record values: * record.text_buf = "it's a line" (unterminated) * record.info.seq = 56 * record.info.ts_nsec = 36863 * record.info.text_len = 11 * record.info.facility = 0 (LOG_KERN) * record.info.flags = 0 * record.info.level = 3 (LOG_ERR) * record.info.caller_id = 299 (task 299) * record.info.dev_info.subsystem = "pci" (terminated) * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" * * Users of the export format should ignore possible additional values * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. */ /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; /* All 3 protected by @console_sem. */ /* the next printk record to write to the console */ static u64 console_seq; static u64 exclusive_console_stop_seq; static unsigned long console_dropped; struct latched_seq { seqcount_latch_t latch; u64 val[2]; }; /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly * access a valid value. Writers are synchronized by @syslog_lock. */ static struct latched_seq clear_seq = { .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), .val[0] = 0, .val[1] = 0, }; #ifdef CONFIG_PRINTK_CALLER #define PREFIX_MAX 48 #else #define PREFIX_MAX 32 #endif /* the maximum size of a formatted record (i.e. with prefix added per line) */ #define CONSOLE_LOG_MAX 1024 /* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). */ #define PRB_AVGBITS 5 /* 32 character average length */ #if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; static struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ static bool __printk_percpu_data_ready __read_mostly; bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { raw_write_seqcount_latch(&ls->latch); ls->val[0] = val; raw_write_seqcount_latch(&ls->latch); ls->val[1] = val; } /* Can be called from any context. */ static u64 latched_seq_read_nolock(struct latched_seq *ls) { unsigned int seq; unsigned int idx; u64 val; do { seq = raw_read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } /* Return log buffer address */ char *log_buf_addr_get(void) { return log_buf; } /* Return log buffer size */ u32 log_buf_len_get(void) { return log_buf_len; } /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available * when the index points to the middle. */ #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; if (*text_len > max_text_len) *text_len = max_text_len; /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); if (*text_len >= *trunc_msg_len) *text_len -= *trunc_msg_len; else *trunc_msg_len = 0; } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { if (dmesg_restrict) return 1; /* * Unless restricted, we allow "read all" and "get buffer size" * for everybody. */ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; } static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) goto ok; /* * For historical reasons, accept CAP_SYS_ADMIN too, with * a warning. */ if (capable(CAP_SYS_ADMIN)) { pr_warn_once("%s (%d): Attempt to access syslog with " "CAP_SYS_ADMIN but no CAP_SYSLOG " "(deprecated).\n", current->comm, task_pid_nr(current)); goto ok; } return -EPERM; } ok: return security_syslog(type); } static void append_char(char **pp, char *e, char c) { if (*pp < e) *(*pp)++ = c; } static ssize_t info_print_ext_header(char *buf, size_t size, struct printk_info *info) { u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); #else caller[0] = '\0'; #endif do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", (info->facility << 3) | info->level, info->seq, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_add_ext_text(char *buf, size_t size, const char *text, size_t text_len, unsigned char endc) { char *p = buf, *e = buf + size; size_t i; /* escape non-printable characters */ for (i = 0; i < text_len; i++) { unsigned char c = text[i]; if (c < ' ' || c >= 127 || c == '\\') p += scnprintf(p, e - p, "\\x%02x", c); else append_char(&p, e, c); } append_char(&p, e, endc); return p - buf; } static ssize_t msg_add_dict_text(char *buf, size_t size, const char *key, const char *val) { size_t val_len = strlen(val); ssize_t len; if (!val_len) return 0; len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); return len; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { ssize_t len; len = msg_add_ext_text(buf, size, text, text_len, '\n'); if (!dev_info) goto out; len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", dev_info->subsystem); len += msg_add_dict_text(buf + len, size - len, "DEVICE", dev_info->device); out: return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; struct printk_info info; char text_buf[CONSOLE_EXT_LOG_MAX]; struct printk_record record; }; static __printf(3, 4) __cold int devkmsg_emit(int facility, int level, const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; } static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ struct file *file = iocb->ki_filp; struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; if (!user || len > LOG_LINE_MAX) return -EINVAL; /* Ignore when user logging is disabled. */ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return len; /* Ratelimit when not explicitly enabled. */ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { if (!___ratelimit(&user->rs, current->comm)) return ret; } buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; buf[len] = '\0'; if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } /* * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace * the decimal value represents 32bit, the lower 3 bit are the log * level, the rest are the log facility. * * If no prefix or no userspace facility is specified, we * enforce LOG_USER, to be able to reliably distinguish * kernel-generated messages from userspace-injected ones. */ line = buf; if (line[0] == '<') { char *endp = NULL; unsigned int u; u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { level = LOG_LEVEL(u); if (LOG_FACILITY(u) != 0) facility = LOG_FACILITY(u); endp++; line = endp; } } devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; struct printk_record *r = &user->record; size_t len; ssize_t ret; if (!user) return -EBADF; ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, prb_read_valid(prb, atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } if (r->info->seq != atomic64_read(&user->seq)) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, r->info->seq); ret = -EPIPE; goto out; } len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, &r->text_buf[0], r->info->text_len, &r->info->dev_info); atomic64_set(&user->seq, r->info->seq + 1); if (len > count) { ret = -EINVAL; goto out; } if (copy_to_user(buf, user->buf, len)) { ret = -EFAULT; goto out; } ret = len; out: mutex_unlock(&user->lock); return ret; } /* * Be careful when modifying this function!!! * * Only few operations are supported because the device works only with the * entire variable length messages (records). Non-standard values are * returned in the other cases and has been this way for quite some time. * User space applications might depend on this behavior. */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; loff_t ret = 0; if (!user) return -EBADF; if (offset) return -ESPIPE; switch (whence) { case SEEK_SET: /* the first record */ atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* * The first record after the last SYSLOG_ACTION_CLEAR, * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } return ret; } static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; struct printk_info info; __poll_t ret = 0; if (!user) return EPOLLERR|EPOLLNVAL; poll_wait(file, &log_wait, wait); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } return ret; } static int devkmsg_open(struct inode *inode, struct file *file) { struct devkmsg_user *user; int err; if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return -EPERM; /* write-only does not need any file context */ if ((file->f_flags & O_ACCMODE) != O_WRONLY) { err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, SYSLOG_FROM_READER); if (err) return err; } user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; ratelimit_default_init(&user->rs); ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); mutex_init(&user->lock); prb_rec_init_rd(&user->record, &user->info, &user->text_buf[0], sizeof(user->text_buf)); atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; } static int devkmsg_release(struct inode *inode, struct file *file) { struct devkmsg_user *user = file->private_data; if (!user) return 0; ratelimit_state_exit(&user->rs); mutex_destroy(&user->lock); kfree(user); return 0; } const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, }; #ifdef CONFIG_CRASH_CORE /* * This appends the listed symbols to /proc/vmcore * * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ void log_buf_vmcoreinfo_setup(void) { struct dev_printk_info *dev_info = NULL; VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); /* * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); VMCOREINFO_OFFSET(prb_data_blk_lpos, next); VMCOREINFO_STRUCT_SIZE(printk_info); VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); VMCOREINFO_STRUCT_SIZE(dev_printk_info); VMCOREINFO_OFFSET(dev_printk_info, subsystem); VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); VMCOREINFO_OFFSET(dev_printk_info, device); VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); VMCOREINFO_OFFSET(prb_data_ring, data); VMCOREINFO_OFFSET(prb_data_ring, head_lpos); VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); VMCOREINFO_STRUCT_SIZE(latched_seq); VMCOREINFO_OFFSET(latched_seq, val); } #endif /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ static void __init log_buf_len_update(u64 size) { if (size > (u64)LOG_BUF_LEN_MAX) { size = (u64)LOG_BUF_LEN_MAX; pr_err("log_buf over 2G is not supported.\n"); } if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { u64 size; if (!str) return -EINVAL; size = memparse(str, &str); log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); #ifdef CONFIG_SMP #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; /* * archs should set up cpu_possible_bits properly with * set_cpu_possible() after setup_arch() but just in * case lets ensure this is valid. */ if (num_possible_cpus() == 1) return; cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; /* by default this will only continue through for large > 64 CPUs */ if (cpu_extra <= __LOG_BUF_LEN / 2) return; pr_info("log_buf_len individual max cpu contribution: %d bytes\n", __LOG_CPU_MAX_BUF_LEN); pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", cpu_extra); pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } #else /* !CONFIG_SMP */ static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ static void __init set_percpu_data_ready(void) { __printk_percpu_data_ready = true; } static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_reserved_entry e; struct printk_record dest_r; prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); return prb_record_text_space(&e); } static char setup_text_buf[LOG_LINE_MAX] __initdata; void __init setup_log_buf(int early) { struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very * early, e.g. from setup_arch(), and second - when percpu_areas * are initialised. */ if (!early) set_percpu_data_ready(); if (log_buf != __log_buf) return; if (!early && !new_log_buf_len) log_buf_add_cpu(); if (!new_log_buf_len) return; new_descs_count = new_log_buf_len >> PRB_AVGBITS; if (new_descs_count == 0) { pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); return; } new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu text bytes not available\n", new_log_buf_len); return; } new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); if (unlikely(!new_infos)) { pr_err("log_buf_len: %zu info bytes not available\n", new_infos_size); goto err_free_descs; } prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; prb_for_each_record(0, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } prb = &printk_rb_dynamic; local_irq_restore(flags); /* * Copy any remaining messages that might have appeared from * NMI context after copying but before switching to the * dynamic buffer. */ prb_for_each_record(seq, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", prb_next_seq(&printk_rb_static) - seq); } pr_info("log_buf_len: %u bytes\n", log_buf_len); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); return; err_free_descs: memblock_free_ptr(new_descs, new_descs_size); err_free_log_buf: memblock_free_ptr(new_log_buf, new_log_buf_len); } static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; } early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); static bool suppress_message_printing(int level) { return (level >= console_loglevel && !ignore_loglevel); } #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; get_option(&str, &boot_delay); if (boot_delay > 10 * 1000) boot_delay = 0; pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 0; } early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress_message_printing(level)) { return; } k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { k--; cpu_relax(); /* * use (volatile) jiffies to prevent * compiler reduction; loop termination via jiffies * is secondary and may or may not happen. */ if (time_after(jiffies, timeout)) break; touch_nmi_watchdog(); } } #else static inline void boot_delay_msec(int level) { } #endif static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_syslog(unsigned int level, char *buf) { return sprintf(buf, "<%u>", level); } static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } #ifdef CONFIG_PRINTK_CALLER static size_t print_caller(u32 id, char *buf) { char caller[12]; snprintf(caller, sizeof(caller), "%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); return sprintf(buf, "[%6s]", caller); } #else #define print_caller(id, buf) 0 #endif static size_t info_print_prefix(const struct printk_info *info, bool syslog, bool time, char *buf) { size_t len = 0; if (syslog) len = print_syslog((info->facility << 3) | info->level, buf); if (time) len += print_time(info->ts_nsec, buf + len); len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; buf[len] = '\0'; } return len; } /* * Prepare the record for printing. The text is shifted within the given * buffer to avoid a need for another one. The following operations are * done: * * - Add prefix for each line. * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). * - Add a string terminator. * * Since the produced string is always terminated, the maximum possible * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added * prefixes and the newline. The terminator is not counted. The dropped * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) { size_t text_len = r->info->text_len; size_t buf_size = r->text_buf_size; char *text = r->text_buf; char prefix[PREFIX_MAX]; bool truncated = false; size_t prefix_len; size_t line_len; size_t len = 0; char *next; /* * If the message was truncated because the buffer was not large * enough, treat the available text as if it were the full text. */ if (text_len > buf_size) text_len = buf_size; prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* * @text_len: bytes of unprocessed text * @line_len: bytes of current line _without_ newline * @text: pointer to beginning of current line * @len: number of bytes prepared in r->text_buf */ for (;;) { next = memchr(text, '\n', text_len); if (next) { line_len = next - text; } else { /* Drop truncated line(s). */ if (truncated) break; line_len = text_len; } /* * Truncate the text if there is not enough space to add the * prefix and a trailing newline and a terminator. */ if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ if (len + prefix_len + line_len + 1 + 1 > buf_size) break; text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); /* * Increment the prepared length to include the text and * prefix that were just moved+copied. Also increment for the * newline at the end of this line. If this is the last line, * there is no newline, but it will be added immediately below. */ len += prefix_len + line_len + 1; if (text_len == line_len) { /* * This is the last line. Add the trailing newline * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; } /* * Advance beyond the added prefix and the related line with * its newline. */ text += prefix_len + line_len + 1; /* * The remaining text has only decreased by the line with its * newline. * * Note that @text_len can become zero. It happens when @text * ended with a newline (either due to truncation or the * original string ending with "\n\n"). The loop is correctly * repeated and (if not truncated) an empty line with a prefix * will be prepared. */ text_len -= line_len + 1; } /* * If a buffer was provided, it will be terminated. Space for the * string terminator is guaranteed to be available. The terminator is * not counted in the return value. */ if (buf_size > 0) r->text_buf[len] = 0; return len; } static size_t get_record_print_text_size(struct printk_info *info, unsigned int line_count, bool syslog, bool time) { char prefix[PREFIX_MAX]; size_t prefix_len; prefix_len = info_print_prefix(info, syslog, time, prefix); /* * Each line will be preceded with a prefix. The intermediate * newlines are already within the text, but a final trailing * newline will be added. */ return ((prefix_len * line_count) + info->text_len + 1); } /* * Beginning with @start_seq, find the first record where it and all following * records up to (but not including) @max_seq fit into @size. * * @max_seq is simply an upper bound and does not need to exist. If the caller * does not require an upper bound, -1 can be used for @max_seq. */ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, bool syslog, bool time) { struct printk_info info; unsigned int line_count; size_t len = 0; u64 seq; /* Determine the size of the records up to @max_seq. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (info.seq >= max_seq) break; len += get_record_print_text_size(&info, line_count, syslog, time); } /* * Adjust the upper bound for the next loop to avoid subtracting * lengths that were never added. */ if (seq < max_seq) max_seq = seq; /* * Move first record forward until length fits into the buffer. Ignore * newest messages that were not counted in the above cycle. Messages * might appear and get lost in the meantime. This is a best effort * that prevents an infinite loop that could occur with a retry. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (len <= size || info.seq >= max_seq) break; len -= get_record_print_text_size(&info, line_count, syslog, time); } return seq; } /* The caller is responsible for making sure @size is greater than 0. */ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); if (!text) return -ENOMEM; prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); mutex_lock(&syslog_lock); /* * Wait for the @syslog_seq record to be available. @syslog_seq may * change while waiting. */ do { seq = syslog_seq; mutex_unlock(&syslog_lock); /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) goto out; } while (syslog_seq != seq); /* * Copy records that fit into the buffer. The above cycle makes sure * that the first record is always available. */ do { size_t n; size_t skip; int err; if (!prb_read_valid(prb, syslog_seq, &r)) break; if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; syslog_partial = 0; } /* * To keep reading/counting partial line consistent, * use printk_time value as of the beginning of a line. */ if (!syslog_partial) syslog_time = printk_time; skip = syslog_partial; n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ /* partial read(), remember position */ n = size; syslog_partial += n; } else n = 0; if (!n) break; mutex_unlock(&syslog_lock); err = copy_to_user(buf, text + skip, n); mutex_lock(&syslog_lock); if (err) { if (!len) len = -EFAULT; break; } len += n; size -= n; buf += n; } while (size); out: mutex_unlock(&syslog_lock); kfree(text); return len; } static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; bool time; text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, size, true, time); prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); len = 0; prb_for_each_record(seq, prb, seq, &r) { int textlen; textlen = record_print_text(&r, true, time); if (len + textlen > size) { seq--; break; } if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; if (len < 0) break; } if (clear) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); mutex_unlock(&syslog_lock); } kfree(text); return len; } static void syslog_clear(void) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) { struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, source); if (error) return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: if (len < 1 || len > 8) return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { /* messages are gone, move to first one */ syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of * records, not the length. */ error = prb_next_seq(prb) - syslog_seq; } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { error += get_record_print_text_size(&info, line_count, true, time); time = printk_time; } error -= syslog_partial; } mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* * Special console_lock variants that help to reduce the risk of soft-lockups. * They allow to pass console_lock to another printk() call using a busy wait. */ #ifdef CONFIG_LOCKDEP static struct lockdep_map console_owner_dep_map = { .name = "console_owner" }; #endif static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter; /** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait * * This basically converts console_lock into a spinlock. This marks * the section where the console_lock owner can not sleep, because * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ static void console_lock_spinning_enable(void) { raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } /** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * * Important: Callers lose the lock if there was a busy waiter. * They must not touch items synchronized by console_lock * in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ static int console_lock_spinning_disable_and_check(void) { int waiter; raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; raw_spin_unlock(&console_owner_lock); if (!waiter) { spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); spin_release(&console_owner_dep_map, _THIS_IP_); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } /** * console_trylock_spinning - try to get console_lock by busy waiting * * This allows to busy wait for the console_lock when the current * owner is running in specially marked sections. It means that * the current owner is running and cannot reschedule until it * is ready to lose the lock. * * Return: 1 if we got the lock, 0 othrewise */ static int console_trylock_spinning(void) { struct task_struct *owner = NULL; bool waiter; bool spin = false; unsigned long flags; if (console_trylock()) return 1; printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); owner = READ_ONCE(console_owner); waiter = READ_ONCE(console_waiter); if (!waiter && owner && owner != current) { WRITE_ONCE(console_waiter, true); spin = true; } raw_spin_unlock(&console_owner_lock); /* * If there is an active printk() writing to the * consoles, instead of having it write our data too, * see if we can offload that load from the active * printer, and do some printing ourselves. * Go into a spin only if there isn't already a waiter * spinning, and there is an active printer, and * that active printer isn't us (recursive printk?). */ if (!spin) { printk_safe_exit_irqrestore(flags); return 0; } /* We spin waiting for the owner to release us */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); /* Owner will clear console_waiter on hand off */ while (READ_ONCE(console_waiter)) cpu_relax(); spin_release(&console_owner_dep_map, _THIS_IP_); printk_safe_exit_irqrestore(flags); /* * The owner passed the console lock to us. * Since we did not spin on console lock, annotate * this as a trylock. Otherwise lockdep will * complain. */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); return 1; } /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { static char dropped_text[64]; size_t dropped_len = 0; struct console *con; trace_console_rcuidle(text, len); if (!console_drivers) return; if (console_dropped) { dropped_len = snprintf(dropped_text, sizeof(dropped_text), "** %lu printk messages dropped **\n", console_dropped); console_dropped = 0; } for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; if (!(con->flags & CON_ENABLED)) continue; if (!con->write) continue; if (!cpu_online(smp_processor_id()) && !(con->flags & CON_ANYTIME)) continue; if (con->flags & CON_EXTENDED) con->write(con, ext_text, ext_len); else { if (dropped_len) con->write(con, dropped_text, dropped_len); con->write(con, text, len); } } } /* * Recursion is tracked separately on each CPU. If NMIs are supported, an * additional NMI context per CPU is also separately tracked. Until per-CPU * is available, a separate "early tracking" is performed. */ static DEFINE_PER_CPU(u8, printk_count); static u8 printk_count_early; #ifdef CONFIG_HAVE_NMI static DEFINE_PER_CPU(u8, printk_count_nmi); static u8 printk_count_nmi_early; #endif /* * Recursion is limited to keep the output sane. printk() should not require * more than 1 level of recursion (allowing, for example, printk() to trigger * a WARN), but a higher value is used in case some printk-internal errors * exist, such as the ringbuffer validation checks failing. */ #define PRINTK_MAX_RECURSION 3 /* * Return a pointer to the dedicated counter for the CPU+context of the * caller. */ static u8 *__printk_recursion_counter(void) { #ifdef CONFIG_HAVE_NMI if (in_nmi()) { if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count_nmi); return &printk_count_nmi_early; } #endif if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count); return &printk_count_early; } /* * Enter recursion tracking. Interrupts are disabled to simplify tracking. * The caller must check the boolean return value to see if the recursion is * allowed. On failure, interrupts are not disabled. * * @recursion_ptr must be a variable of type (u8 *) and is the same variable * that is passed to printk_exit_irqrestore(). */ #define printk_enter_irqsave(recursion_ptr, flags) \ ({ \ bool success = true; \ \ typecheck(u8 *, recursion_ptr); \ local_irq_save(flags); \ (recursion_ptr) = __printk_recursion_counter(); \ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ local_irq_restore(flags); \ success = false; \ } else { \ (*(recursion_ptr))++; \ } \ success; \ }) /* Exit recursion tracking, restoring interrupts. */ #define printk_exit_irqrestore(recursion_ptr, flags) \ do { \ typecheck(u8 *, recursion_ptr); \ (*(recursion_ptr))--; \ local_irq_restore(flags); \ } while (0) int printk_delay_msec __read_mostly; static inline void printk_delay(void) { if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; while (m--) { mdelay(1); touch_nmi_watchdog(); } } } static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : 0x80000000 + raw_smp_processor_id(); } /** * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * * @flags may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; while (*text) { kern_level = printk_get_level(text); if (!kern_level) break; switch (kern_level) { case '0' ... '7': if (level && *level == LOGLEVEL_DEFAULT) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ if (flags) *flags |= LOG_CONT; } prefix_len += 2; text += 2; } return prefix_len; } static u16 printk_sprint(char *text, u16 size, int facility, enum printk_info_flags *flags, const char *fmt, va_list args) { u16 text_len; text_len = vscnprintf(text, size, fmt, args); /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; *flags |= LOG_NEWLINE; } /* Strip log level and control flags. */ if (facility == 0) { u16 prefix_len; prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); } } return text_len; } __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; u8 *recursion_ptr; u16 reserve_size; va_list args2; u16 text_len; int ret = 0; u64 ts_nsec; /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is * close to the call of printk(). This provides a more deterministic * timestamp with respect to the caller. */ ts_nsec = local_clock(); if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. An extra byte must be reserved so that * later the vscnprintf() into the reserved buffer has room for the * terminating '\0', which is not counted by vsnprintf(). */ va_copy(args2, args); reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; va_end(args2); if (reserve_size > LOG_LINE_MAX) reserve_size = LOG_LINE_MAX; /* Extract log level or control flags. */ if (facility == 0) printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) flags |= LOG_NEWLINE; if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { prb_commit(&e); } ret = text_len; goto out; } } /* * Explicitly initialize the record before every prb_reserve() call. * prb_reserve_in_last() and prb_reserve() purposely invalidate the * structure when they fail. */ prb_rec_init_wr(&r, reserve_size); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&reserve_size, &trunc_msg_len); prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) goto out; } /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); ret = text_len + trunc_msg_len; out: printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { int printed_len; bool in_sched = false; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; } boot_delay_msec(level); printk_delay(); printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to * console */ preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ if (console_trylock_spinning()) console_unlock(); preempt_enable(); } wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk(fmt, args); va_end(args); return r; } EXPORT_SYMBOL(_printk); #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 static u64 syslog_seq; static u64 console_seq; static u64 exclusive_console_stop_seq; static unsigned long console_dropped; static size_t record_print_text(const struct printk_record *r, bool syslog, bool time) { return 0; } static ssize_t info_print_ext_header(char *buf, size_t size, struct printk_info *info) { return 0; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; char buf[512]; int n; if (!early_console) return; va_start(ap, fmt); n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); early_console->write(early_console, buf, n); } #endif static int __add_preferred_console(char *name, int idx, char *options, char *brl_options, bool user_specified) { struct console_cmdline *c; int i; /* * See if this tty is not yet registered, and * if we have a slot free. */ for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { if (strcmp(c->name, name) == 0 && c->index == idx) { if (!brl_options) preferred_console = i; if (user_specified) c->user_specified = true; return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; c->user_specified = user_specified; braille_set_options(c, brl_options); c->index = idx; return 0; } static int __init console_msg_format_setup(char *str) { if (!strcmp(str, "syslog")) console_msg_format = MSG_FORMAT_SYSLOG; if (!strcmp(str, "default")) console_msg_format = MSG_FORMAT_DEFAULT; return 1; } __setup("console_msg_format=", console_msg_format_setup); /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; /* * console="" or console=null have been suggested as a way to * disable console output. Use ttynull that has been created * for exactly this purpose. */ if (str[0] == 0 || strcmp(str, "null") == 0) { __add_preferred_console("ttynull", 0, NULL, NULL, true); return 1; } if (_braille_console_setup(&str, &brl_options)) return 1; /* * Decode str into name, index, options. */ if (str[0] >= '0' && str[0] <= '9') { strcpy(buf, "ttyS"); strncpy(buf + 4, str, sizeof(buf) - 5); } else { strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; options = strchr(str, ','); if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) strcpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; __add_preferred_console(buf, idx, options, brl_options, true); console_set_on_cmdline = 1; return 1; } __setup("console=", console_setup); /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name * @idx: device index * @options: options for this console * * The last preferred console added will be used for kernel messages * and stdin/out/err for init. Normally this is used by console_setup * above to handle user-supplied console arguments; however it can also * be used by arch-specific code either to override the user or more * commonly to provide a default console (ie from PROM variables) when * the user has not supplied one. */ int add_preferred_console(char *name, int idx, char *options) { return __add_preferred_console(name, idx, options, NULL, false); } bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); module_param_named(console_suspend, console_suspend_enabled, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); static bool printk_console_no_auto_verbose; void console_verbose(void) { if (console_loglevel && !printk_console_no_auto_verbose) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } EXPORT_SYMBOL_GPL(console_verbose); module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** * suspend_console - suspend the console subsystem * * This disables printk() while we go into suspend states */ void suspend_console(void) { if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; up_console_sem(); } void resume_console(void) { if (!console_suspend_enabled) return; down_console_sem(); console_suspended = 0; console_unlock(); } /** * console_cpu_notify - print deferred console messages after CPU hotplug * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be printed on the console only if there are CON_ANYTIME consoles. * This function is called when a new CPU comes online (or fails to come * up) or goes offline. */ static int console_cpu_notify(unsigned int cpu) { if (!cpuhp_tasks_frozen) { /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); } return 0; } /** * console_lock - lock the console system for exclusive use. * * Acquires a lock which guarantees that the caller has * exclusive access to the console system and the console_drivers list. * * Can sleep, returns nothing. */ void console_lock(void) { might_sleep(); down_console_sem(); if (console_suspended) return; console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * * Try to acquire a lock which guarantees that the caller has exclusive * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. */ int console_trylock(void) { if (down_trylock_console_sem()) return 0; if (console_suspended) { up_console_sem(); return 0; } console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { return console_locked; } EXPORT_SYMBOL(is_console_locked); /* * Check if we have any console that is capable of printing while cpu is * booting or shutting down. Requires console_sem. */ static int have_callable_console(void) { struct console *con; for_each_console(con) if ((con->flags & CON_ENABLED) && (con->flags & CON_ANYTIME)) return 1; return 0; } /* * Can we actually use the console at this time on this cpu? * * Console drivers may assume that per-cpu resources have been allocated. So * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't * call them until this CPU is officially up. */ static inline int can_use_console(void) { return cpu_online(raw_smp_processor_id()) || have_callable_console(); } /** * console_unlock - unlock the console system * * Releases the console_lock which the caller holds on the console system * and the console driver list. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock(); emits * the output prior to releasing the lock. * * If there is output waiting, we wake /dev/kmsg and syslog() users. * * console_unlock(); may be called from any context. */ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[CONSOLE_LOG_MAX]; unsigned long flags; bool do_cond_resched, retry; struct printk_info info; struct printk_record r; u64 __maybe_unused next_seq; if (console_suspended) { up_console_sem(); return; } prb_rec_init_rd(&r, &info, text, sizeof(text)); /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. * * console_trylock() is not able to detect the preemptive * context reliably. Therefore the value must be stored before * and cleared after the "again" goto label. */ do_cond_resched = console_may_schedule; again: console_may_schedule = 0; /* * We released the console_sem lock, so we need to recheck if * cpu is online and (if not) is there at least one CON_ANYTIME * console. */ if (!can_use_console()) { console_locked = 0; up_console_sem(); return; } for (;;) { size_t ext_len = 0; int handover; size_t len; skip: if (!prb_read_valid(prb, console_seq, &r)) break; if (console_seq != r.info->seq) { console_dropped += r.info->seq - console_seq; console_seq = r.info->seq; } if (suppress_message_printing(r.info->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and * record that has level above the console loglevel. */ console_seq++; goto skip; } /* Output to all consoles once old messages replayed. */ if (unlikely(exclusive_console && console_seq >= exclusive_console_stop_seq)) { exclusive_console = NULL; } /* * Handle extended console text first because later * record_print_text() will modify the record buffer in-place. */ if (nr_ext_console_drivers) { ext_len = info_print_ext_header(ext_text, sizeof(ext_text), r.info); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, &r.text_buf[0], r.info->text_len, &r.info->dev_info); } len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); console_seq++; /* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. * * Interrupts are disabled because the hand over to a waiter * must not be interrupted until the hand over is completed * (@console_waiter is cleared). */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); handover = console_lock_spinning_disable_and_check(); printk_safe_exit_irqrestore(flags); if (handover) return; if (do_cond_resched) cond_resched(); } /* Get consistent value of the next-to-be-used sequence number. */ next_seq = console_seq; console_locked = 0; up_console_sem(); /* * Someone could have filled up the buffer again, so re-check if there's * something to flush. In case we cannot trylock the console_sem again, * there's a new owner and the console_unlock() from them will do the * flush, no worries. */ retry = prb_read_valid(prb, next_seq, NULL); if (retry && console_trylock()) goto again; } EXPORT_SYMBOL(console_unlock); /** * console_conditional_schedule - yield the CPU if required * * If the console code is currently allowed to sleep, and * if this CPU should yield the CPU to another task, do * so here. * * Must be called within console_lock();. */ void __sched console_conditional_schedule(void) { if (console_may_schedule) cond_resched(); } EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { struct console *c; /* * console_unblank can no longer be called in interrupt context unless * oops_in_progress is set to 1.. */ if (oops_in_progress) { if (down_trylock_console_sem() != 0) return; } else console_lock(); console_locked = 1; console_may_schedule = 0; for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); console_unlock(); } /** * console_flush_on_panic - flush console content on panic * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ void console_flush_on_panic(enum con_flush_mode mode) { /* * If someone else is holding the console lock, trylock will fail * and may_schedule may be set. Ignore and proceed to unlock so * that messages are flushed out. As this can be called from any * context and we don't want to get preempted while flushing, * ensure may_schedule is cleared. */ console_trylock(); console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) console_seq = prb_first_valid_seq(prb); console_unlock(); } /* * Return the console tty driver structure and its associated index */ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; console_lock(); for_each_console(c) { if (!c->device) continue; driver = c->device(c, index); if (driver) break; } console_unlock(); return driver; } /* * Prevent further output on the passed console device so that (for example) * serial drivers can disable console output before suspending a port, and can * re-enable output afterwards. */ void console_stop(struct console *console) { console_lock(); console->flags &= ~CON_ENABLED; console_unlock(); } EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { console_lock(); console->flags |= CON_ENABLED; console_unlock(); } EXPORT_SYMBOL(console_start); static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; pr_info("debug: skip boot console de-registration.\n"); return 0; } early_param("keep_bootcon", keep_bootcon_setup); /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected * by either the command line or add_preferred_console() and * setup/enable it. * * Care need to be taken with consoles that are statically * enabled such as netconsole */ static int try_enable_new_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { if (c->user_specified != user_specified) continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && newcon->index != c->index) continue; if (newcon->index < 0) newcon->index = c->index; if (_braille_register_console(newcon, c)) return 0; if (newcon->setup && (err = newcon->setup(newcon, c->options)) != 0) return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) { newcon->flags |= CON_CONSDEV; has_preferred_console = true; } return 0; } /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; } /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. * * This can happen pretty early during the boot process (because of * early_printk) - sometimes before setup_arch() completes - be careful * of what kernel features are used - they may not be initialised yet. * * There are two types of consoles - bootconsoles (early_printk) and * "real" consoles (everything which is not a bootconsole) which are * handled differently. * - Any number of bootconsoles can be registered at any time. * - As soon as a "real" console is registered, all bootconsoles * will be unregistered automatically. * - Once a "real" console is registered, any attempt to register a * bootconsoles will be rejected */ void register_console(struct console *newcon) { struct console *bcon = NULL; int err; for_each_console(bcon) { if (WARN(bcon == newcon, "console '%s%d' already registered\n", bcon->name, bcon->index)) return; } /* * before we register a new CON_BOOT console, make sure we don't * already have a valid console */ if (newcon->flags & CON_BOOT) { for_each_console(bcon) { if (!(bcon->flags & CON_BOOT)) { pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); return; } } } if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; if (!has_preferred_console || bcon || !console_drivers) has_preferred_console = preferred_console >= 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here. */ if (!has_preferred_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || newcon->setup(newcon, NULL) == 0) { newcon->flags |= CON_ENABLED; if (newcon->device) { newcon->flags |= CON_CONSDEV; has_preferred_console = true; } } } /* See if this console matches one we selected on the command line */ err = try_enable_new_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) err = try_enable_new_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) return; /* * If we have a bootconsole, and are switching to a real console, * don't print everything out again, since when the boot console, and * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) newcon->flags &= ~CON_PRINTBUFFER; /* * Put this console in the list - keep the * preferred driver at the head of the list. */ console_lock(); if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { newcon->next = console_drivers; console_drivers = newcon; if (newcon->next) newcon->next->flags &= ~CON_CONSDEV; /* Ensure this flag is always set for the head of the list */ newcon->flags |= CON_CONSDEV; } else { newcon->next = console_drivers->next; console_drivers->next = newcon; } if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; if (newcon->flags & CON_PRINTBUFFER) { /* * console_unlock(); will print out the buffered messages * for us. * * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to * the already-registered consoles. * * Set exclusive_console with disabled interrupts to reduce * race window with eventual console_flush_on_panic() that * ignores console_lock. */ exclusive_console = newcon; exclusive_console_stop_seq = console_seq; /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); console_seq = syslog_seq; mutex_unlock(&syslog_lock); } console_unlock(); console_sysfs_notify(); /* * By unregistering the bootconsoles after we enable the real console * we get the "console xxx enabled" message on all the consoles - * boot consoles, real consoles, etc - this is to ensure that end * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ pr_info("%sconsole [%s%d] enabled\n", (newcon->flags & CON_BOOT) ? "boot" : "" , newcon->name, newcon->index); if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { /* We need to iterate through all boot consoles, to make * sure we print everything out, before we unregister them. */ for_each_console(bcon) if (bcon->flags & CON_BOOT) unregister_console(bcon); } } EXPORT_SYMBOL(register_console); int unregister_console(struct console *console) { struct console *con; int res; pr_info("%sconsole [%s%d] disabled\n", (console->flags & CON_BOOT) ? "boot" : "" , console->name, console->index); res = _braille_unregister_console(console); if (res < 0) return res; if (res > 0) return 0; res = -ENODEV; console_lock(); if (console_drivers == console) { console_drivers=console->next; res = 0; } else { for_each_console(con) { if (con->next == console) { con->next = console->next; res = 0; break; } } } if (res) goto out_disable_unlock; if (console->flags & CON_EXTENDED) nr_ext_console_drivers--; /* * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. */ if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; console->flags &= ~CON_ENABLED; console_unlock(); console_sysfs_notify(); if (console->exit) res = console->exit(console); return res; out_disable_unlock: console->flags &= ~CON_ENABLED; console_unlock(); return res; } EXPORT_SYMBOL(unregister_console); /* * Initialize the console device. This is called *early*, so * we can't necessarily depend on lots of kernel help here. * Just do some early initializations, and do the complex setup * later. */ void __init console_init(void) { int ret; initcall_t call; initcall_entry_t *ce; /* Setup the default TTY line discipline. */ n_tty_init(); /* * set up the console device so that later boot sequences can * inform about problems etc.. */ ce = __con_initcall_start; trace_initcall_level("console"); while (ce < __con_initcall_end) { call = initcall_from_entry(ce); trace_initcall_start(call); ret = call(); trace_initcall_finish(call, ret); ce++; } } /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code * will access this data, unregister the boot consoles in a late initcall. * * If for some reason, such as deferred probe or the driver being a loadable * module, the real console hasn't registered yet at this point, there will * be a brief interval in which no messages are logged to the console, which * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory * intersects with the init section. Note that all other boot consoles will * get unregistered when the real preferred console is registered. */ static int __init printk_late_init(void) { struct console *con; int ret; for_each_console(con) { if (!(con->flags & CON_BOOT)) continue; /* Check addresses that might be used for enabled consoles. */ if (init_section_intersects(con, sizeof(*con)) || init_section_contains(con->write, 0) || init_section_contains(con->read, 0) || init_section_contains(con->device, 0) || init_section_contains(con->unblank, 0) || init_section_contains(con->data, 0)) { /* * Please, consider moving the reported consoles out * of the init section. */ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", con->name, con->index); unregister_console(con); } } ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); return 0; } late_initcall(printk_late_init); #if defined CONFIG_PRINTK /* * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 #define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); } if (pending & PRINTK_PENDING_WAKEUP) wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. * * The full memory barrier within wq_has_sleeper() pairs with the full * memory barrier within set_current_state() of * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * * This pairs with devkmsg_read:A and syslog_print:A. */ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } void defer_console_output(void) { /* * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) { defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) { int r; r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); defer_console_output(); return r; } int _printk_deferred(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_deferred(fmt, args); va_end(args); return r; } /* * printk rate limiting, lifted from the networking subsystem. * * This enforces a rate limit: not more than 10 kernel messages * every 5s to make a denial-of-service attack impossible. */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); int __printk_ratelimit(const char *func) { return ___ratelimit(&printk_ratelimit_state, func); } EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state * @interval_msecs: minimum interval between prints * * printk_timed_ratelimit() returns true if more than @interval_msecs * milliseconds have elapsed since the last time printk_timed_ratelimit() * returned true. */ bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { unsigned long elapsed = jiffies - *caller_jiffies; if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) return false; *caller_jiffies = jiffies; return true; } EXPORT_SYMBOL(printk_timed_ratelimit); static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. */ int kmsg_dump_register(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EBUSY; /* The dump callback needs to be set */ if (!dumper->dump) return -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. * @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. */ int kmsg_dump_unregister(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) { switch (reason) { case KMSG_DUMP_PANIC: return "Panic"; case KMSG_DUMP_OOPS: return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; case KMSG_DUMP_SHUTDOWN: return "Shutdown"; default: return "Unknown"; } } EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * * Call each of the registered dumper's dump() callback, which can * retrieve the kmsg records with kmsg_dump_get_line() or * kmsg_dump_get_buffer(). */ void kmsg_dump(enum kmsg_dump_reason reason) { struct kmsg_dumper *dumper; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; /* * If client has not provided a specific max_reason, default * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. */ if (max_reason == KMSG_DUMP_UNDEF) { max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : KMSG_DUMP_OOPS; } if (reason > max_reason) continue; /* invoke dumper which will iterate over records */ dumper->dump(dumper, reason); } rcu_read_unlock(); } /** * kmsg_dump_get_line - retrieve one kmsg log line * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * * Start at the beginning of the kmsg buffer, with the oldest kmsg * record, and copy one record into the provided buffer. * * Consecutive calls will return the next available record moving * towards the end of the buffer with the youngest messages. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; unsigned int line_count; struct printk_record r; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? */ if (line) { if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } l = get_record_print_text_size(&info, line_count, syslog, printk_time); } iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) *len = l; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len_out: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * * Consecutive calls will fill the buffer with the next block of * available older records, not including the earlier retrieved ones. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, char *buf, size_t size, size_t *len_out) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; u64 seq; u64 next_seq; size_t len = 0; bool ret = false; bool time = printk_time; if (!buf || !size) goto out; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ iter->cur_seq = info.seq; } } /* last entry */ if (iter->cur_seq >= iter->next_seq) goto out; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. Pass in size-1 * because this function (by way of record_print_text()) will * not write more than size-1 bytes of text into @buf. */ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, size - 1, syslog, time); /* * Next kmsg_dump_get_buffer() invocation will dump block of * older records stored right before this one. */ next_seq = seq; prb_rec_init_rd(&r, &info, buf, size); len = 0; prb_for_each_record(seq, prb, seq, &r) { if (r.info->seq >= iter->next_seq) break; len += record_print_text(&r, syslog, time); /* Adjust record to store to remaining buffer space. */ prb_rec_init_rd(&r, &info, buf + len, size - len); } iter->next_seq = next_seq; ret = true; out: if (len_out) *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** * kmsg_dump_rewind - reset the iterator * @iter: kmsg dump iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif #ifdef CONFIG_SMP static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); /** * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. * * Context: Any context. */ void __printk_wait_on_cpu_lock(void) { do { cpu_relax(); } while (atomic_read(&printk_cpulock_owner) != -1); } EXPORT_SYMBOL(__printk_wait_on_cpu_lock); /** * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the * lock, this function succeeds immediately. * * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ int __printk_cpu_trylock(void) { int cpu; int old; cpu = smp_processor_id(); /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with * __printk_cpu_unlock:B. * * Memory barrier involvement: * * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B. * * Relies on: * * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B * of the previous CPU * matching * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B * of this CPU */ old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1, cpu); /* LMM(__printk_cpu_trylock:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing * data: LMM(__printk_cpu_trylock:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ atomic_inc(&printk_cpulock_nested); return 1; } return 0; } EXPORT_SYMBOL(__printk_cpu_trylock); /** * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ void __printk_cpu_unlock(void) { if (atomic_read(&printk_cpulock_nested)) { atomic_dec(&printk_cpulock_nested); return; } /* * This CPU is finished loading/storing data: * LMM(__printk_cpu_unlock:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs * with __printk_cpu_trylock:A. * * Memory barrier involvement: * * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A. * * Relies on: * * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B * of this CPU * matching * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B * of the next CPU */ atomic_set_release(&printk_cpulock_owner, -1); /* LMM(__printk_cpu_unlock:B) */ } EXPORT_SYMBOL(__printk_cpu_unlock); #endif /* CONFIG_SMP */
268 204 204 204 268 268 268 6 6 6 128 2 128 128 128 215 83 204 204 204 204 204 204 5 203 234 234 141 234 234 226 234 229 204 172 170 168 225 224 224 198 128 16 301 301 16 16 221 221 6 6 3 3 6 221 200 221 221 221 221 291 171 291 289 289 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 // SPDX-License-Identifier: GPL-2.0-only /* * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. * * 10Apr2002 Andrew Morton * Initial version */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/init.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/mpage.h> #include <linux/rmap.h> #include <linux/percpu.h> #include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/pagevec.h> #include <linux/timer.h> #include <linux/sched/rt.h> #include <linux/sched/signal.h> #include <linux/mm_inline.h> #include <trace/events/writeback.h> #include "internal.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). */ #define MAX_PAUSE max(HZ/5, 1) /* * Try to keep balance_dirty_pages() call intervals higher than this many pages * by raising pause time to max_pause when falls below it. */ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* * Estimate write bandwidth at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) #define RATELIMIT_CALC_SHIFT 10 /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ static long ratelimit_pages = 32; /* The following parameters are exported via /proc/sys/vm */ /* * Start background writeback (via writeback threads) at this percentage */ int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ EXPORT_SYMBOL_GPL(dirty_writeback_interval); /* * The longest time for which data is allowed to remain dirty */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: * a full sync is triggered after this time elapses without any disk activity. */ int laptop_mode; EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; /* consolidated parameters for balance_dirty_pages() and its subroutines */ struct dirty_throttle_control { #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *dom; struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ #endif struct bdi_writeback *wb; struct fprop_local_percpu *wb_completions; unsigned long avail; /* dirtyable */ unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; unsigned long wb_bg_thresh; unsigned long pos_ratio; }; /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will * reflect changes in current writeout rate. */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) #ifdef CONFIG_CGROUP_WRITEBACK #define GDTC_INIT(__wb) .wb = (__wb), \ .dom = &global_wb_domain, \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB .dom = &global_wb_domain #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ .dom = mem_cgroup_wb_domain(__wb), \ .wb_completions = &(__wb)->memcg_completions, \ .gdtc = __gdtc static bool mdtc_valid(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return mdtc->gdtc; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return &wb->memcg_completions; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; /* * @wb may already be clean by the time control reaches here and * the total may not include its bw. */ if (this_bw < tot_bw) { if (min) { min *= this_bw; min = div64_ul(min, tot_bw); } if (max < 100) { max *= this_bw; max = div64_ul(max, tot_bw); } } *minp = min; *maxp = max; } #else /* CONFIG_CGROUP_WRITEBACK */ #define GDTC_INIT(__wb) .wb = (__wb), \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB #define MDTC_INIT(__wb, __gdtc) static bool mdtc_valid(struct dirty_throttle_control *dtc) { return false; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return &global_wb_domain; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return NULL; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return NULL; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { *minp = wb->bdi->min_ratio; *maxp = wb->bdi->max_ratio; } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect * lowmem and the ability to uphold the zone's watermarks without * requiring writeback. * * This number of dirtyable pages is the base value of which the * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * * Because the user is allowed to specify the dirty limit globally as * absolute number of bytes, calculating the per-zone dirty limit can * require translating the configured limit into a percentage of * global dirtyable memory first. */ /** * node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node * * Return: the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. */ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) { unsigned long nr_pages = 0; int z; for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = pgdat->node_zones + z; if (!populated_zone(zone)) continue; nr_pages += zone_page_state(zone, NR_FREE_PAGES); } /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ nr_pages -= min(nr_pages, pgdat->totalreserve_pages); nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); return nr_pages; } static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; int i; for_each_node_state(node, N_HIGH_MEMORY) { for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { struct zone *z; unsigned long nr_pages; if (!is_highmem_idx(i)) continue; z = &NODE_DATA(node)->node_zones[i]; if (!populated_zone(z)) continue; nr_pages = zone_page_state(z, NR_FREE_PAGES); /* watch for underflows */ nr_pages -= min(nr_pages, high_wmark_pages(z)); nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); x += nr_pages; } } /* * Unreclaimable memory (kernel memory or anonymous memory * without swap) can bring down the dirtyable pages below * the zone's dirty balance reserve and the above calculation * will underflow. However we still want to add in nodes * which are below threshold (negative values) to get a more * accurate calculation but make sure that the total never * underflows. */ if ((long)x < 0) x = 0; /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure * that this does not occur. */ return min(x, total); #else return 0; #endif } /** * global_dirtyable_memory - number of globally dirtyable pages * * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) { unsigned long x; x = global_zone_page_state(NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ x -= min(x, totalreserve_pages); x += global_node_page_state(NR_INACTIVE_FILE); x += global_node_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); return x + 1; /* Ensure that we never return 0 */ } /** * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain * @dtc: dirty_throttle_control of interest * * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { const unsigned long available_memory = dtc->avail; struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; /* convert ratios to per-PAGE_SIZE for higher precision */ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; unsigned long thresh; unsigned long bg_thresh; struct task_struct *tsk; /* gdtc is !NULL iff @dtc is for memcg domain */ if (gdtc) { unsigned long global_avail = gdtc->avail; /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against * globally available memory. As the ratios are in * per-PAGE_SIZE, they can be obtained by dividing bytes by * number of pages. */ if (bytes) ratio = min(DIV_ROUND_UP(bytes, global_avail), PAGE_SIZE); if (bg_bytes) bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), PAGE_SIZE); bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else thresh = (ratio * available_memory) / PAGE_SIZE; if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; if (bg_thresh >= thresh) bg_thresh = thresh / 2; tsk = current; if (rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; /* we should eventually report the domain in the TP */ if (!gdtc) trace_global_dirty_state(bg_thresh, thresh); } /** * global_dirty_limits - background-writeback and dirty-throttling thresholds * @pbackground: out parameter for bg_thresh * @pdirty: out parameter for thresh * * Calculate bg_thresh and thresh for global_wb_domain. See * domain_dirty_limits() for details. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; gdtc.avail = global_dirtyable_memory(); domain_dirty_limits(&gdtc); *pbackground = gdtc.bg_thresh; *pdirty = gdtc.thresh; } /** * node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node * * Return: the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. */ static unsigned long node_dirty_limit(struct pglist_data *pgdat) { unsigned long node_memory = node_dirtyable_memory(pgdat); struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * node_memory / global_dirtyable_memory(); else dirty = vm_dirty_ratio * node_memory / 100; if (rt_task(tsk)) dirty += dirty / 4; return dirty; } /** * node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check * * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ bool node_dirty_ok(struct pglist_data *pgdat) { unsigned long limit = node_dirty_limit(pgdat); unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; } int dirty_background_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { writeback_set_ratelimit(); vm_dirty_bytes = 0; } return ret; } int dirty_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } static unsigned long wp_next_time(unsigned long cur_time) { cur_time += VM_COMPLETIONS_PERIOD_LEN; /* 0 has a special meaning... */ if (!cur_time) return 1; return cur_time; } static void wb_domain_writeout_inc(struct wb_domain *dom, struct fprop_local_percpu *completions, unsigned int max_prop_frac) { __fprop_inc_percpu_max(&dom->completions, completions, max_prop_frac); /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ dom->period_time = wp_next_time(jiffies); mod_timer(&dom->period_timer, dom->period_time); } } /* * Increment @wb's writeout completion count and the global writeout * completion count. Called from test_clear_page_writeback(). */ static inline void __wb_writeout_inc(struct bdi_writeback *wb) { struct wb_domain *cgdom; inc_wb_stat(wb, WB_WRITTEN); wb_domain_writeout_inc(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac); cgdom = mem_cgroup_wb_domain(wb); if (cgdom) wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), wb->bdi->max_prop_frac); } void wb_writeout_inc(struct bdi_writeback *wb) { unsigned long flags; local_irq_save(flags); __wb_writeout_inc(wb); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. */ static void writeout_period(struct timer_list *t) { struct wb_domain *dom = from_timer(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; if (fprop_new_period(&dom->completions, miss_periods + 1)) { dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ dom->period_time = 0; } } int wb_domain_init(struct wb_domain *dom, gfp_t gfp) { memset(dom, 0, sizeof(*dom)); spin_lock_init(&dom->lock); timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); dom->dirty_limit_tstamp = jiffies; return fprop_global_init(&dom->completions, gfp); } #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { del_timer_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. */ static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { min_ratio -= bdi->min_ratio; if (bdi_min_ratio + min_ratio < 100) { bdi_min_ratio += min_ratio; bdi->min_ratio += min_ratio; } else { ret = -EINVAL; } } spin_unlock_bh(&bdi_lock); return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { int ret = 0; if (max_ratio > 100) return -EINVAL; spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; } spin_unlock_bh(&bdi_lock); return ret; } EXPORT_SYMBOL(bdi_set_max_ratio); static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { return (thresh + bg_thresh) / 2; } static unsigned long hard_dirty_limit(struct wb_domain *dom, unsigned long thresh) { return max(thresh, dom->dirty_limit); } /* * Memory which can be further allocated to a memcg domain is capped by * system-wide clean memory excluding the amount being used in the domain. */ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, unsigned long filepages, unsigned long headroom) { struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); unsigned long clean = filepages - min(filepages, mdtc->dirty); unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); unsigned long other_clean = global_clean - min(global_clean, clean); mdtc->avail = filepages + min(headroom, other_clean); } /** * __wb_calc_thresh - @wb's share of dirty throttling threshold * @dtc: dirty_throttle_context of interest * * Note that balance_dirty_pages() will only seriously take it as a hard limit * when sleeping max_pause per page is not enough to keep the dirty pages under * control. For example, when the device is completely stalled due to some error * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty and PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; u64 wb_thresh; unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* * Calculate this BDI's share of the thresh ratio. */ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / 100; if (wb_thresh > (thresh * wb_max_ratio) / 100) wb_thresh = thresh * wb_max_ratio / 100; return wb_thresh; } unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb), .thresh = thresh }; return __wb_calc_thresh(&gdtc); } /* * setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * * it's a 3rd order polynomial that subjects to * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, (limit - setpoint) | 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); } /* * Dirty position control. * * (o) global/bdi setpoints * * We want the dirty pages be balanced around the global/wb setpoints. * When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. * * pos_ratio = 1 << RATELIMIT_CALC_SHIFT * * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * * if (wb_dirty < wb_setpoint) scale up pos_ratio * if (wb_dirty > wb_setpoint) scale down pos_ratio * * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * * (o) global control line * * ^ pos_ratio * | * | |<===== global dirty control scope ======>| * 2.0 * * * * * * * * | .* * | . * * | . * * | . * * | . * * | . * * 1.0 ................................* * | . . * * | . . * * | . . * * | . . * * | . . * * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * * (o) wb control line * * ^ pos_ratio * | * | * * | * * | * * | * * | * |<=========== span ============>| * 1.0 .......................* * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * 1/4 ...............................................* * * * * * * * * * * * * | . . * | . . * | . . * 0 +----------------------.-------------------------------.-------------> * wb_setpoint^ x_intercept^ * * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can * be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD * card's wb_dirty may rush to many times higher than wb_setpoint. * - the wb dirty thresh drops quickly due to change of JBOD workload */ static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; dtc->pos_ratio = 0; if (unlikely(dtc->dirty >= limit)) return; /* * global setpoint * * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if * wb_dirty is under wb_setpoint and vice versa. * * Note, that we cannot use global counters in these calculations * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; if (dtc->wb_dirty < 8) { dtc->pos_ratio = min_t(long long, pos_ratio * 2, 2 << RATELIMIT_CALC_SHIFT); return; } if (dtc->wb_dirty >= wb_thresh) return; wb_setpoint = dirty_freerun_ceiling(wb_thresh, dtc->wb_bg_thresh); if (wb_setpoint == 0 || wb_setpoint == wb_thresh) return; wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, wb_thresh); /* * Typically, for strictlimit case, wb_setpoint << setpoint * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other * wb's) while given strictlimit wb is below limit. * * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); return; } /* * We have computed basic pos_ratio above based on global situation. If * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. */ /* * wb setpoint * * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * * x_intercept - wb_dirty * := -------------------------- * x_intercept - wb_setpoint * * The main wb control line is a linear function that subjects to * * (1) f(wb_setpoint) = 1.0 * (2) k = - 1 / (8 * write_bw) (in single wb case) * or equally: x_intercept = wb_setpoint + 8 * write_bw * * For single wb case, the dirty pages are observed to fluctuate * regularly within range * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); wb_setpoint = setpoint * (u64)x >> 16; /* * Use span=(8*write_bw) in single wb case as indicated by * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. * * wb_thresh thresh - wb_thresh * span = --------- * (8 * write_bw) + ------------------ * wb_thresh * thresh thresh */ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; x_intercept = wb_setpoint + span; if (dtc->wb_dirty < x_intercept - span / 4) { pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. */ x_intercept = wb_thresh / 2; if (dtc->wb_dirty < x_intercept) { if (dtc->wb_dirty > x_intercept / 8) pos_ratio = div_u64(pos_ratio * x_intercept, dtc->wb_dirty); else pos_ratio *= 8; } dtc->pos_ratio = pos_ratio; } static void wb_update_write_bandwidth(struct bdi_writeback *wb, unsigned long elapsed, unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); unsigned long avg = wb->avg_write_bandwidth; unsigned long old = wb->write_bandwidth; u64 bw; /* * bw = written * HZ / elapsed * * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period * * @written may have decreased due to account_page_redirty(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { bw = div64_ul(bw, elapsed); avg = bw; goto out; } bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* * one more level of smoothing, for filtering out sudden spikes */ if (avg > old && old >= (unsigned long)bw) avg -= (avg - old) >> 3; if (avg < old && old <= (unsigned long)bw) avg += (old - avg) >> 3; out: /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ avg = max(avg, 1LU); if (wb_has_dirty_io(wb)) { long delta = avg - wb->avg_write_bandwidth; WARN_ON_ONCE(atomic_long_add_return(delta, &wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; WRITE_ONCE(wb->avg_write_bandwidth, avg); } static void update_dirty_limit(struct dirty_throttle_control *dtc) { struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; unsigned long limit = dom->dirty_limit; /* * Follow up in one step. */ if (limit < thresh) { limit = thresh; goto update; } /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce * dom->dirty_limit which is guaranteed to lie above the dirty pages. */ thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: dom->dirty_limit = limit; } static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; spin_lock(&dom->lock); if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { update_dirty_limit(dtc); dom->dirty_limit_tstamp = now; } spin_unlock(&dom->lock); } /* * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * * Normal wb tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. */ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long dirtied, unsigned long elapsed) { struct bdi_writeback *wb = dtc->wb; unsigned long dirty = dtc->dirty; unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long setpoint = (freerun + limit) / 2; unsigned long write_bw = wb->avg_write_bandwidth; unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. The theory is, * if there are N dd tasks, each throttled at task_ratelimit, the wb's * dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * * Note that the expanded form is not a pure rate feedback: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) * but also takes pos_ratio into account: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) * * (1) is not realistic because pos_ratio also takes part in balancing * the dirty rate. Consider the state * pos_ratio = 0.5 (3) * rate = 2 * (write_bw / N) (4) * If (1) is used, it will stuck in that state! Because each dd will * be throttled at * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) * yielding * dirty_rate = N * task_ratelimit = write_bw (6) * put (6) into (1) we get * rate_(i+1) = rate_(i) (7) * * So we end up using (2) to always keep * rate_(i+1) ~= (write_bw / N) (8) * regardless of the value of pos_ratio. As long as (8) is satisfied, * pos_ratio is able to drive itself to 1.0, which is not only where * the dirty count meet the setpoint, but also where the slope of * pos_ratio is most flat and hence task_ratelimit is least fluctuated. */ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, dirty_rate | 1); /* * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw */ if (unlikely(balanced_dirty_ratelimit > write_bw)) balanced_dirty_ratelimit = write_bw; /* * We could safely do this and return immediately: * * wb->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated * code makes use of task_ratelimit to filter out singular points and * limit the step size. * * The below code essentially only uses the relative value of * * task_ratelimit - dirty_ratelimit * = (pos_ratio - 1) * dirty_ratelimit * * which reflects the direction and size of dirty position error. */ /* * dirty_ratelimit will follow balanced_dirty_ratelimit iff * task_ratelimit is on the same side of dirty_ratelimit, too. * For example, when * - dirty_ratelimit > balanced_dirty_ratelimit * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) * lowering dirty_ratelimit will help meet both the position and rate * control targets. Otherwise, don't update dirty_ratelimit if it will * only help meet the rate target. After all, what the users ultimately * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size * and filter out the singular points of balanced_dirty_ratelimit. Which * keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). */ step = 0; /* * For strictlimit case, calculations above were based on wb counters * and limits (starting from pos_ratio = wb_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". * * We rampup dirty_ratelimit forcibly if wb_dirty is low because * it's possible that wb_thresh is close to zero due to inactivity * of backing device. */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; if (dtc->wb_dirty < 8) setpoint = dtc->wb_dirty + 1; else setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { x = min3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { x = max3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; } /* * Don't pursue 100% rate matching. It's impossible since the balanced * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ shift = dirty_ratelimit / (2 * step + 1); if (shift < BITS_PER_LONG) step = DIV_ROUND_UP(step >> shift, 8); else step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; else dirty_ratelimit -= step; WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); } static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; unsigned long elapsed; unsigned long dirtied; unsigned long written; spin_lock(&wb->list_lock); /* * Lockless checks for elapsed time are racy and delayed update after * IO completion doesn't do it at all (to make sure written pages are * accounted reasonably quickly). Make sure elapsed >= 1 to avoid * division errors. */ elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); if (update_ratelimit) { domain_update_dirty_limit(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* * @mdtc is always NULL if !CGROUP_WRITEBACK but the * compiler has no way to figure that out. Help it. */ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { domain_update_dirty_limit(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } } wb_update_write_bandwidth(wb, elapsed, written); wb->dirtied_stamp = dirtied; wb->written_stamp = written; WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; __wb_update_bandwidth(&gdtc, NULL, false); } /* Interval after which we consider wb idle and don't estimate bandwidth */ #define WB_BANDWIDTH_IDLE_JIF (HZ) static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) { unsigned long now = jiffies; unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); if (elapsed > WB_BANDWIDTH_IDLE_JIF && !atomic_read(&wb->writeback_inodes)) { spin_lock(&wb->list_lock); wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); wb->written_stamp = wb_stat(wb, WB_WRITTEN); WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } } /* * After a task dirtied this many pages, balance_dirty_pages_ratelimited() * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh) { if (thresh > dirty) return 1UL << (ilog2(thresh - dirty) >> 1); return 1; } static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long * time, a small pool of dirty/writeback pages may go empty and disk go * idle. * * 8 serves as the safety ratio. */ t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; return min_t(unsigned long, t, MAX_PAUSE); } static long wb_min_pause(struct bdi_writeback *wb, long max_pause, unsigned long task_ratelimit, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ /* target for 10ms pause on 1-dd case */ t = max(1, HZ / 100); /* * Scale up pause time for concurrent dirtiers in order to reduce CPU * overheads. * * (N * 10ms) on 2^N concurrent tasks. */ if (hi > lo) t += (hi - lo) * (10 * HZ) / 1024; /* * This is a bit convoluted. We try to base the next nr_dirtied_pause * on the much more stable dirty_ratelimit. However the next pause time * will be computed based on task_ratelimit and the two rate limits may * depart considerably at some time. Especially if task_ratelimit goes * below dirty_ratelimit/2 and the target pause is max_pause, the next * pause time will be max_pause*2 _trimmed down_ to max_pause. As a * result task_ratelimit won't be executed faithfully, which could * eventually bring down dirty_ratelimit. * * We apply two rules to fix it up: * 1) try to estimate the next pause time and if necessary, use a lower * nr_dirtied_pause so as not to exceed max_pause. When this happens, * nr_dirtied_pause will be "dancing" with task_ratelimit. * 2) limit the target pause time to max_pause/2, so that the normal * small fluctuations of task_ratelimit won't trigger rule (1) and * nr_dirtied_pause will remain as stable as dirty_ratelimit. */ t = min(t, 1 + max_pause / 2); pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); /* * Tiny nr_dirtied_pause is found to hurt I/O performance in the test * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. * When the 16 consecutive reads are often interrupted by some dirty * throttling pause during the async writes, cfq will go into idles * (deadline is fine). So push nr_dirtied_pause as high as possible * until reaches DIRTY_POLL_THRESH=32 pages. */ if (pages < DIRTY_POLL_THRESH) { t = max_pause; pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); if (pages > DIRTY_POLL_THRESH) { pages = DIRTY_POLL_THRESH; t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; } } pause = HZ * pages / (task_ratelimit + 1); if (pause > max_pause) { t = max_pause; pages = task_ratelimit * t / roundup_pow_of_two(HZ); } *nr_dirtied_pause = pages; /* * The minimal pause time will normally be half the target pause time. */ return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long wb_reclaimable; /* * wb_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, wb_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (wb_dirty >> wb_thresh) either because * wb_dirty starts high, or because wb_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until wb_dirty drops under * wb_thresh. Instead the auxiliary wb control line in * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ dtc->wb_thresh = __wb_calc_thresh(dtc); dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; unsigned long nr_reclaimable; /* = file_dirty */ long period; long pause; long max_pause; long min_pause; int nr_dirtied_pause; bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; unsigned long dirty, thresh, bg_thresh; unsigned long m_dirty = 0; /* stop bogus uninit warnings */ unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); domain_dirty_limits(gdtc); if (unlikely(strictlimit)) { wb_dirty_limits(gdtc); dirty = gdtc->wb_dirty; thresh = gdtc->wb_thresh; bg_thresh = gdtc->wb_bg_thresh; } else { dirty = gdtc->dirty; thresh = gdtc->thresh; bg_thresh = gdtc->bg_thresh; } if (mdtc) { unsigned long filepages, headroom, writeback; /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, &writeback); mdtc->dirty += writeback; mdtc_calc_avail(mdtc, filepages, headroom); domain_dirty_limits(mdtc); if (unlikely(strictlimit)) { wb_dirty_limits(mdtc); m_dirty = mdtc->wb_dirty; m_thresh = mdtc->wb_thresh; m_bg_thresh = mdtc->wb_bg_thresh; } else { m_dirty = mdtc->dirty; m_thresh = mdtc->thresh; m_bg_thresh = mdtc->bg_thresh; } } /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts * when the wb limits are ramping up in case of !strictlimit. * * In strictlimit case make decision based on the wb counters * and limits. Small writeouts when the wb limits are ramping * up are the price we consciously pay for strictlimit-ing. * * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. */ if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { unsigned long intv; unsigned long m_intv; free_running: intv = dirty_poll_interval(dirty, thresh); m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) m_intv = dirty_poll_interval(m_dirty, m_thresh); current->nr_dirtied_pause = min(intv, m_intv); break; } if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); mem_cgroup_flush_foreign(wb); /* * Calculate global domain's pos_ratio and select the * global dtc by default. */ if (!strictlimit) { wb_dirty_limits(gdtc); if ((current->flags & PF_LOCAL_THROTTLE) &&